X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/4452a7af2eac33dbad800bcc91f2399d62c18f53..2d21ac55c334faf3a56e5634905ed6987fc787d4:/osfmk/vm/vm_resident.c diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index 91b359f80..5d4d80b47 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2007 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -63,10 +63,12 @@ */ #include +#include #include #include #include +#include #include #include #include @@ -86,13 +88,21 @@ #include /* (BRINGUP) */ #include +#include +#include + +#if CONFIG_EMBEDDED +#include +#endif + +int speculative_age_index = 0; +int speculative_steal_index = 0; + +struct vm_speculative_age_q vm_page_queue_speculative[VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1]; + +static void vm_page_insert_internal(vm_page_t, vm_object_t, vm_object_offset_t, boolean_t); -/* Variables used to indicate the relative age of pages in the - * inactive list - */ -unsigned int vm_page_ticket_roll = 0; -unsigned int vm_page_ticket = 0; /* * Associated with page of user-allocatable memory is a * page structure. @@ -128,12 +138,9 @@ vm_page_bucket_t *vm_page_buckets; /* Array of buckets */ unsigned int vm_page_bucket_count = 0; /* How big is array? */ unsigned int vm_page_hash_mask; /* Mask for hash function */ unsigned int vm_page_hash_shift; /* Shift for hash function */ -uint32_t vm_page_bucket_hash; /* Basic bucket hash */ +uint32_t vm_page_bucket_hash; /* Basic bucket hash */ decl_simple_lock_data(,vm_page_bucket_lock) -vm_page_t -vm_page_lookup_nohint(vm_object_t object, vm_object_offset_t offset); - #if MACH_PAGE_HASH_STATS /* This routine is only for debug. It is intended to be called by @@ -183,7 +190,7 @@ hash_debug(void) */ vm_size_t page_size = PAGE_SIZE; vm_size_t page_mask = PAGE_MASK; -int page_shift = PAGE_SHIFT; +int page_shift = PAGE_SHIFT; /* * Resident page structures are initialized from @@ -195,13 +202,21 @@ int page_shift = PAGE_SHIFT; */ struct vm_page vm_page_template; +vm_page_t vm_pages = VM_PAGE_NULL; +unsigned int vm_pages_count = 0; + /* * Resident pages that represent real memory - * are allocated from a free list. + * are allocated from a set of free lists, + * one per color. */ -vm_page_t vm_page_queue_free; +unsigned int vm_colors; +unsigned int vm_color_mask; /* mask is == (vm_colors-1) */ +unsigned int vm_cache_geometry_colors = 0; /* set by hw dependent code during startup */ +queue_head_t vm_page_queue_free[MAX_COLORS]; vm_page_t vm_page_queue_fictitious; unsigned int vm_page_free_wanted; +unsigned int vm_page_free_wanted_privileged; unsigned int vm_page_free_count; unsigned int vm_page_fictitious_count; @@ -228,6 +243,16 @@ unsigned int io_throttle_zero_fill; */ vm_offset_t vm_page_fictitious_addr = (vm_offset_t) -1; +/* + * Guard pages are not accessible so they don't + * need a physical address, but we need to enter + * one in the pmap. + * Let's make it recognizable and make sure that + * we don't use a real physical page with that + * physical address. 
+ */ +vm_offset_t vm_page_guard_addr = (vm_offset_t) -2; + /* * Resident page structures are also chained on * queues that are used by the page replacement @@ -240,8 +265,12 @@ vm_offset_t vm_page_fictitious_addr = (vm_offset_t) -1; */ queue_head_t vm_page_queue_active; queue_head_t vm_page_queue_inactive; +queue_head_t vm_page_queue_zf; /* inactive memory queue for zero fill */ + unsigned int vm_page_active_count; unsigned int vm_page_inactive_count; +unsigned int vm_page_throttled_count; +unsigned int vm_page_speculative_count; unsigned int vm_page_wire_count; unsigned int vm_page_gobble_count = 0; unsigned int vm_page_wire_count_warning = 0; @@ -250,6 +279,10 @@ unsigned int vm_page_gobble_count_warning = 0; unsigned int vm_page_purgeable_count = 0; /* # of pages purgeable now */ uint64_t vm_page_purged_count = 0; /* total count of purged pages */ +unsigned int vm_page_speculative_recreated = 0; +unsigned int vm_page_speculative_created = 0; +unsigned int vm_page_speculative_used = 0; + ppnum_t vm_lopage_poolstart = 0; ppnum_t vm_lopage_poolend = 0; int vm_lopage_poolsize = 0; @@ -265,8 +298,9 @@ uint64_t max_valid_dma_address = 0xffffffffffffffffULL; unsigned int vm_page_free_target = 0; unsigned int vm_page_free_min = 0; unsigned int vm_page_inactive_target = 0; +unsigned int vm_page_inactive_min = 0; unsigned int vm_page_free_reserved = 0; -unsigned int vm_page_throttled_count = 0; +unsigned int vm_page_zfill_throttle_count = 0; /* * The VM system has a couple of heuristics for deciding @@ -300,6 +334,34 @@ vm_set_page_size(void) break; } + +/* Called once during statup, once the cache geometry is known. + */ +static void +vm_page_set_colors( void ) +{ + unsigned int n, override; + + if ( PE_parse_boot_arg("colors", &override) ) /* colors specified as a boot-arg? */ + n = override; + else if ( vm_cache_geometry_colors ) /* do we know what the cache geometry is? */ + n = vm_cache_geometry_colors; + else n = DEFAULT_COLORS; /* use default if all else fails */ + + if ( n == 0 ) + n = 1; + if ( n > MAX_COLORS ) + n = MAX_COLORS; + + /* the count must be a power of 2 */ + if ( ( n & (n - 1)) !=0 ) + panic("vm_page_set_colors"); + + vm_colors = n; + vm_color_mask = n - 1; +} + + /* * vm_page_bootstrap: * @@ -336,11 +398,14 @@ vm_page_bootstrap( m->listq.next = NULL; m->listq.prev = NULL; + m->speculative = FALSE; + m->throttled = FALSE; m->inactive = FALSE; m->active = FALSE; + m->no_cache = FALSE; m->laundry = FALSE; m->free = FALSE; - m->no_isync = TRUE; + m->pmapped = FALSE; m->reference = FALSE; m->pageout = FALSE; m->dump_cleaning = FALSE; @@ -357,18 +422,15 @@ vm_page_bootstrap( m->cleaning = FALSE; m->precious = FALSE; m->clustered = FALSE; - m->lock_supplied = FALSE; m->unusual = FALSE; m->restart = FALSE; m->zero_fill = FALSE; m->encrypted = FALSE; + m->encrypted_cleaning = FALSE; + m->deactivated = FALSE; m->phys_page = 0; /* reset later */ - m->page_lock = VM_PROT_NONE; - m->unlock_request = VM_PROT_NONE; - m->page_error = KERN_SUCCESS; - /* * Initialize the page queues. 
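The per-color free lists introduced in this hunk index pages by the low bits of their physical page number, which is why vm_page_set_colors() (shown above) insists on a power-of-two color count: the color can then be derived with a single mask, vm_color_mask == vm_colors - 1. Below is a minimal userspace sketch of that relationship; the 32-color value and all names here are illustrative assumptions, not values taken from the diff.

#include <assert.h>
#include <stdio.h>

/* Illustrative only: pick a power-of-two color count, as vm_page_set_colors()
 * requires, and show that "phys_page & mask" selects a free-list color. */
int main(void)
{
    unsigned int colors = 32;              /* assumed value for the sketch */
    unsigned int mask;

    assert((colors & (colors - 1)) == 0);  /* must be a power of two */
    mask = colors - 1;                     /* same shape as vm_color_mask */

    for (unsigned int phys_page = 0; phys_page < 8; phys_page++)
        printf("phys_page %u -> free list color %u\n", phys_page, phys_page & mask);

    return 0;
}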
*/ @@ -376,13 +438,44 @@ vm_page_bootstrap( mutex_init(&vm_page_queue_free_lock, 0); mutex_init(&vm_page_queue_lock, 0); - vm_page_queue_free = VM_PAGE_NULL; + mutex_init(&vm_purgeable_queue_lock, 0); + + for (i = 0; i < PURGEABLE_Q_TYPE_MAX; i++) { + int group; + + purgeable_queues[i].token_q_head = 0; + purgeable_queues[i].token_q_tail = 0; + for (group = 0; group < NUM_VOLATILE_GROUPS; group++) + queue_init(&purgeable_queues[i].objq[group]); + + purgeable_queues[i].type = i; + purgeable_queues[i].new_pages = 0; +#if MACH_ASSERT + purgeable_queues[i].debug_count_tokens = 0; + purgeable_queues[i].debug_count_objects = 0; +#endif + }; + + for (i = 0; i < MAX_COLORS; i++ ) + queue_init(&vm_page_queue_free[i]); + queue_init(&vm_lopage_queue_free); vm_page_queue_fictitious = VM_PAGE_NULL; queue_init(&vm_page_queue_active); queue_init(&vm_page_queue_inactive); + queue_init(&vm_page_queue_throttled); queue_init(&vm_page_queue_zf); + for ( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ ) { + queue_init(&vm_page_queue_speculative[i].age_q); + + vm_page_queue_speculative[i].age_ts.tv_sec = 0; + vm_page_queue_speculative[i].age_ts.tv_nsec = 0; + } vm_page_free_wanted = 0; + vm_page_free_wanted_privileged = 0; + + vm_page_set_colors(); + /* * Steal memory for the map and zone subsystems. @@ -472,10 +565,11 @@ vm_page_bootstrap( * all VM managed pages are "free", courtesy of pmap_startup. */ vm_page_wire_count = atop_64(max_mem) - vm_page_free_count; /* initial value */ - - printf("vm_page_bootstrap: %d free pages\n", vm_page_free_count); vm_page_free_count_minimum = vm_page_free_count; + printf("vm_page_bootstrap: %d free pages and %d wired pages\n", + vm_page_free_count, vm_page_wire_count); + simple_lock_init(&vm_paging_lock, 0); } @@ -558,7 +652,6 @@ pmap_startup( vm_offset_t *endp) { unsigned int i, npages, pages_initialized, fill, fillval; - vm_page_t pages; ppnum_t phys_page; addr64_t tmpaddr; unsigned int num_of_lopages = 0; @@ -571,9 +664,9 @@ pmap_startup( tmpaddr = (addr64_t)pmap_free_pages() * (addr64_t)PAGE_SIZE; /* Get the amount of memory left */ tmpaddr = tmpaddr + (addr64_t)(round_page_32(virtual_space_start) - virtual_space_start); /* Account for any slop */ - npages = (unsigned int)(tmpaddr / (addr64_t)(PAGE_SIZE + sizeof(*pages))); /* Figure size of all vm_page_ts, including enough to hold the vm_page_ts */ + npages = (unsigned int)(tmpaddr / (addr64_t)(PAGE_SIZE + sizeof(*vm_pages))); /* Figure size of all vm_page_ts, including enough to hold the vm_page_ts */ - pages = (vm_page_t) pmap_steal_memory(npages * sizeof *pages); + vm_pages = (vm_page_t) pmap_steal_memory(npages * sizeof *vm_pages); /* * Initialize the page frames. 
@@ -582,16 +675,18 @@ pmap_startup( if (!pmap_next_page(&phys_page)) break; - vm_page_init(&pages[i], phys_page); + vm_page_init(&vm_pages[i], phys_page); vm_page_pages++; pages_initialized++; } + vm_pages_count = pages_initialized; /* * Check if we want to initialize pages to a known value */ fill = 0; /* Assume no fill */ if (PE_parse_boot_arg("fill", &fillval)) fill = 1; /* Set fill */ + /* * if vm_lopage_poolsize is non-zero, than we need to reserve @@ -604,7 +699,7 @@ pmap_startup( for (i = 0, num_of_lopages = vm_lopage_poolsize; num_of_lopages && i < pages_initialized; num_of_lopages--, i++) { vm_page_t m; - m = &pages[i]; + m = &vm_pages[i]; if (m->phys_page >= (1 << (32 - PAGE_SHIFT))) panic("couldn't reserve the lopage pool: not enough lo pages\n"); @@ -632,8 +727,8 @@ pmap_startup( if (2 == vm_himemory_mode) { // free low -> high so high is preferred for (i = last_index + 1; i <= pages_initialized; i++) { - if(fill) fillPage(pages[i - 1].phys_page, fillval); /* Fill the page with a know value if requested at boot */ - vm_page_release(&pages[i - 1]); + if(fill) fillPage(vm_pages[i - 1].phys_page, fillval); /* Fill the page with a know value if requested at boot */ + vm_page_release(&vm_pages[i - 1]); } } else @@ -646,35 +741,42 @@ pmap_startup( * they require several consecutive pages. */ for (i = pages_initialized; i > last_index; i--) { - if(fill) fillPage(pages[i - 1].phys_page, fillval); /* Fill the page with a know value if requested at boot */ - vm_page_release(&pages[i - 1]); + if(fill) fillPage(vm_pages[i - 1].phys_page, fillval); /* Fill the page with a know value if requested at boot */ + vm_page_release(&vm_pages[i - 1]); } #if 0 { vm_page_t xx, xxo, xxl; - int j, k, l; + int i, j, k, l; j = 0; /* (BRINGUP) */ xxl = 0; - for(xx = vm_page_queue_free; xx; xxl = xx, xx = xx->pageq.next) { /* (BRINGUP) */ - j++; /* (BRINGUP) */ - if(j > vm_page_free_count) { /* (BRINGUP) */ - panic("pmap_startup: too many pages, xx = %08X, xxl = %08X\n", xx, xxl); - } - - l = vm_page_free_count - j; /* (BRINGUP) */ - k = 0; /* (BRINGUP) */ - - if(((j - 1) & 0xFFFF) == 0) kprintf("checking number %d of %d\n", j, vm_page_free_count); - - for(xxo = xx->pageq.next; xxo; xxo = xxo->pageq.next) { /* (BRINGUP) */ - k++; - if(k > l) panic("pmap_startup: too many in secondary check %d %d\n", k, l); - if((xx->phys_page & 0xFFFFFFFF) == (xxo->phys_page & 0xFFFFFFFF)) { /* (BRINGUP) */ - panic("pmap_startup: duplicate physaddr, xx = %08X, xxo = %08X\n", xx, xxo); + for( i = 0; i < vm_colors; i++ ) { + queue_iterate(&vm_page_queue_free[i], + xx, + vm_page_t, + pageq) { /* BRINGUP */ + j++; /* (BRINGUP) */ + if(j > vm_page_free_count) { /* (BRINGUP) */ + panic("pmap_startup: too many pages, xx = %08X, xxl = %08X\n", xx, xxl); + } + + l = vm_page_free_count - j; /* (BRINGUP) */ + k = 0; /* (BRINGUP) */ + + if(((j - 1) & 0xFFFF) == 0) kprintf("checking number %d of %d\n", j, vm_page_free_count); + + for(xxo = xx->pageq.next; xxo != &vm_page_queue_free[i]; xxo = xxo->pageq.next) { /* (BRINGUP) */ + k++; + if(k > l) panic("pmap_startup: too many in secondary check %d %d\n", k, l); + if((xx->phys_page & 0xFFFFFFFF) == (xxo->phys_page & 0xFFFFFFFF)) { /* (BRINGUP) */ + panic("pmap_startup: duplicate physaddr, xx = %08X, xxo = %08X\n", xx, xxo); + } } + + xxl = xx; } } @@ -768,6 +870,7 @@ vm_page_create( ( (natural_t)((uint32_t)object * vm_page_bucket_hash) + ((uint32_t)atop_64(offset) ^ vm_page_bucket_hash))\ & vm_page_hash_mask) + /* * vm_page_insert: [ internal use only ] * @@ -776,12 +879,22 @@ 
vm_page_create( * * The object must be locked. */ - void vm_page_insert( - register vm_page_t mem, - register vm_object_t object, - register vm_object_offset_t offset) + vm_page_t mem, + vm_object_t object, + vm_object_offset_t offset) +{ + vm_page_insert_internal(mem, object, offset, FALSE); +} + + +static void +vm_page_insert_internal( + vm_page_t mem, + vm_object_t object, + vm_object_offset_t offset, + boolean_t queues_lock_held) { register vm_page_bucket_t *bucket; @@ -790,9 +903,14 @@ vm_page_insert( (integer_t)object, (integer_t)offset, (integer_t)mem, 0,0); VM_PAGE_CHECK(mem); -#if DEBUG - _mutex_assert(&object->Lock, MA_OWNED); + if (object == vm_submap_object) { + /* the vm_submap_object is only a placeholder for submaps */ + panic("vm_page_insert(vm_submap_object,0x%llx)\n", offset); + } + + vm_object_lock_assert_exclusive(object); +#if DEBUG if (mem->tabled || mem->object != VM_OBJECT_NULL) panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) " "already in (obj=%p,off=0x%llx)", @@ -840,11 +958,15 @@ vm_page_insert( object->resident_page_count++; - if (object->purgable == VM_OBJECT_PURGABLE_VOLATILE || - object->purgable == VM_OBJECT_PURGABLE_EMPTY) { - vm_page_lock_queues(); + if (object->purgable == VM_PURGABLE_VOLATILE || + object->purgable == VM_PURGABLE_EMPTY) { + if (queues_lock_held == FALSE) + vm_page_lockspin_queues(); + vm_page_purgeable_count++; - vm_page_unlock_queues(); + + if (queues_lock_held == FALSE) + vm_page_unlock_queues(); } } @@ -867,8 +989,8 @@ vm_page_replace( vm_page_t found_m = VM_PAGE_NULL; VM_PAGE_CHECK(mem); + vm_object_lock_assert_exclusive(object); #if DEBUG - _mutex_assert(&object->Lock, MA_OWNED); _mutex_assert(&vm_page_queue_lock, MA_OWNED); if (mem->tabled || mem->object != VM_OBJECT_NULL) @@ -932,8 +1054,8 @@ vm_page_replace( found_m->offset = (vm_object_offset_t) -1; object->resident_page_count--; - if (object->purgable == VM_OBJECT_PURGABLE_VOLATILE || - object->purgable == VM_OBJECT_PURGABLE_EMPTY) { + if (object->purgable == VM_PURGABLE_VOLATILE || + object->purgable == VM_PURGABLE_EMPTY) { assert(vm_page_purgeable_count > 0); vm_page_purgeable_count--; } @@ -958,8 +1080,8 @@ vm_page_replace( object->resident_page_count++; - if (object->purgable == VM_OBJECT_PURGABLE_VOLATILE || - object->purgable == VM_OBJECT_PURGABLE_EMPTY) { + if (object->purgable == VM_PURGABLE_VOLATILE || + object->purgable == VM_PURGABLE_EMPTY) { vm_page_purgeable_count++; } } @@ -986,8 +1108,8 @@ vm_page_remove( (integer_t)mem, 0,0); #if DEBUG _mutex_assert(&vm_page_queue_lock, MA_OWNED); - _mutex_assert(&mem->object->Lock, MA_OWNED); #endif + vm_object_lock_assert_exclusive(mem->object); assert(mem->tabled); assert(!mem->cleaning); VM_PAGE_CHECK(mem); @@ -1030,12 +1152,11 @@ vm_page_remove( mem->object->resident_page_count--; - if (mem->object->purgable == VM_OBJECT_PURGABLE_VOLATILE || - mem->object->purgable == VM_OBJECT_PURGABLE_EMPTY) { + if (mem->object->purgable == VM_PURGABLE_VOLATILE || + mem->object->purgable == VM_PURGABLE_EMPTY) { assert(vm_page_purgeable_count > 0); vm_page_purgeable_count--; } - mem->tabled = FALSE; mem->object = VM_OBJECT_NULL; mem->offset = (vm_object_offset_t) -1; @@ -1054,6 +1175,9 @@ unsigned long vm_page_lookup_hint = 0; unsigned long vm_page_lookup_hint_next = 0; unsigned long vm_page_lookup_hint_prev = 0; unsigned long vm_page_lookup_hint_miss = 0; +unsigned long vm_page_lookup_bucket_NULL = 0; +unsigned long vm_page_lookup_miss = 0; + vm_page_t vm_page_lookup( @@ -1063,23 +1187,25 @@ vm_page_lookup( register vm_page_t mem; 
register vm_page_bucket_t *bucket; queue_entry_t qe; -#if 0 - _mutex_assert(&object->Lock, MA_OWNED); -#endif + vm_object_lock_assert_held(object); mem = object->memq_hint; + if (mem != VM_PAGE_NULL) { assert(mem->object == object); + if (mem->offset == offset) { vm_page_lookup_hint++; return mem; } qe = queue_next(&mem->listq); + if (! queue_end(&object->memq, qe)) { vm_page_t next_page; next_page = (vm_page_t) qe; assert(next_page->object == object); + if (next_page->offset == offset) { vm_page_lookup_hint_next++; object->memq_hint = next_page; /* new hint */ @@ -1087,11 +1213,13 @@ vm_page_lookup( } } qe = queue_prev(&mem->listq); + if (! queue_end(&object->memq, qe)) { vm_page_t prev_page; prev_page = (vm_page_t) qe; assert(prev_page->object == object); + if (prev_page->offset == offset) { vm_page_lookup_hint_prev++; object->memq_hint = prev_page; /* new hint */ @@ -1099,24 +1227,24 @@ vm_page_lookup( } } } - /* - * Search the hash table for this object/offset pair + * Search the hash table for this object/offset pair */ - bucket = &vm_page_buckets[vm_page_hash(object, offset)]; - /* - * since we hold the object lock, we are guaranteed that no - * new pages can be inserted into this object... this in turn - * guarantess that the page we're looking for can't exist - * if the bucket it hashes to is currently NULL even when looked - * at outside the scope of the hash bucket lock... this is a - * really cheap optimiztion to avoid taking the lock - */ - if (bucket->pages == VM_PAGE_NULL) { - return (VM_PAGE_NULL); - } + /* + * since we hold the object lock, we are guaranteed that no + * new pages can be inserted into this object... this in turn + * guarantess that the page we're looking for can't exist + * if the bucket it hashes to is currently NULL even when looked + * at outside the scope of the hash bucket lock... this is a + * really cheap optimiztion to avoid taking the lock + */ + if (bucket->pages == VM_PAGE_NULL) { + vm_page_lookup_bucket_NULL++; + + return (VM_PAGE_NULL); + } simple_lock(&vm_page_bucket_lock); for (mem = bucket->pages; mem != VM_PAGE_NULL; mem = mem->next) { @@ -1132,40 +1260,13 @@ vm_page_lookup( } assert(mem->object == object); object->memq_hint = mem; - } + } else + vm_page_lookup_miss++; return(mem); } -vm_page_t -vm_page_lookup_nohint( - vm_object_t object, - vm_object_offset_t offset) -{ - register vm_page_t mem; - register vm_page_bucket_t *bucket; - -#if 0 - _mutex_assert(&object->Lock, MA_OWNED); -#endif - /* - * Search the hash table for this object/offset pair - */ - - bucket = &vm_page_buckets[vm_page_hash(object, offset)]; - - simple_lock(&vm_page_bucket_lock); - for (mem = bucket->pages; mem != VM_PAGE_NULL; mem = mem->next) { - VM_PAGE_CHECK(mem); - if ((mem->object == object) && (mem->offset == offset)) - break; - } - simple_unlock(&vm_page_bucket_lock); - - return(mem); -} - /* * vm_page_rename: * @@ -1178,19 +1279,28 @@ void vm_page_rename( register vm_page_t mem, register vm_object_t new_object, - vm_object_offset_t new_offset) + vm_object_offset_t new_offset, + boolean_t encrypted_ok) { assert(mem->object != new_object); + /* * ENCRYPTED SWAP: * The encryption key is based on the page's memory object * (aka "pager") and paging offset. Moving the page to * another VM object changes its "pager" and "paging_offset" - * so it has to be decrypted first. + * so it has to be decrypted first, or we would lose the key. + * + * One exception is VM object collapsing, where we transfer pages + * from one backing object to its parent object. 
This operation also + * transfers the paging information, so the info + * should remain consistent. The caller (vm_object_do_collapse()) + * sets "encrypted_ok" in this case. */ - if (mem->encrypted) { + if (!encrypted_ok && mem->encrypted) { panic("vm_page_rename: page %p is encrypted\n", mem); } + /* * Changes to mem->object require the page lock because * the pageout daemon uses that lock to get the object. @@ -1201,7 +1311,7 @@ vm_page_rename( (integer_t)new_object, (integer_t)new_offset, (integer_t)mem, 0,0); - vm_page_lock_queues(); + vm_page_lockspin_queues(); vm_page_remove(mem); vm_page_insert(mem, new_object, new_offset); vm_page_unlock_queues(); @@ -1234,14 +1344,17 @@ int c_vm_page_grab_fictitious = 0; int c_vm_page_release_fictitious = 0; int c_vm_page_more_fictitious = 0; +extern vm_page_t vm_page_grab_fictitious_common(vm_offset_t phys_addr); + vm_page_t -vm_page_grab_fictitious(void) +vm_page_grab_fictitious_common( + vm_offset_t phys_addr) { register vm_page_t m; m = (vm_page_t)zget(vm_page_zone); if (m) { - vm_page_init(m, vm_page_fictitious_addr); + vm_page_init(m, phys_addr); m->fictitious = TRUE; } @@ -1249,6 +1362,18 @@ vm_page_grab_fictitious(void) return m; } +vm_page_t +vm_page_grab_fictitious(void) +{ + return vm_page_grab_fictitious_common(vm_page_fictitious_addr); +} + +vm_page_t +vm_page_grab_guard(void) +{ + return vm_page_grab_fictitious_common(vm_page_guard_addr); +} + /* * vm_page_release_fictitious: * @@ -1262,7 +1387,8 @@ vm_page_release_fictitious( assert(!m->free); assert(m->busy); assert(m->fictitious); - assert(m->phys_page == vm_page_fictitious_addr); + assert(m->phys_page == vm_page_fictitious_addr || + m->phys_page == vm_page_guard_addr); c_vm_page_release_fictitious++; #if DEBUG @@ -1368,43 +1494,6 @@ void vm_page_more_fictitious(void) mutex_unlock(&vm_page_alloc_lock); } -/* - * vm_page_convert: - * - * Attempt to convert a fictitious page into a real page. - */ - -boolean_t -vm_page_convert( - register vm_page_t m) -{ - register vm_page_t real_m; - - assert(m->busy); - assert(m->fictitious); - assert(!m->dirty); - - real_m = vm_page_grab(); - if (real_m == VM_PAGE_NULL) - return FALSE; - - m->phys_page = real_m->phys_page; - m->fictitious = FALSE; - m->no_isync = TRUE; - - vm_page_lock_queues(); - if (m->active) - vm_page_active_count++; - else if (m->inactive) - vm_page_inactive_count++; - vm_page_unlock_queues(); - - real_m->phys_page = vm_page_fictitious_addr; - real_m->fictitious = TRUE; - - vm_page_release_fictitious(real_m); - return TRUE; -} /* * vm_pool_low(): @@ -1436,7 +1525,7 @@ int vm_himemory_mode = 0; */ unsigned int vm_lopage_free_count = 0; unsigned int vm_lopage_max_count = 0; -vm_page_t vm_lopage_queue_free = VM_PAGE_NULL; +queue_head_t vm_lopage_queue_free; vm_page_t vm_page_grablo(void) @@ -1449,18 +1538,25 @@ vm_page_grablo(void) mutex_lock(&vm_page_queue_free_lock); - if ((mem = vm_lopage_queue_free) != VM_PAGE_NULL) { + if (! 
queue_empty(&vm_lopage_queue_free)) { + queue_remove_first(&vm_lopage_queue_free, + mem, + vm_page_t, + pageq); + assert(mem->free); + assert(mem->busy); + assert(!mem->pmapped); - vm_lopage_queue_free = (vm_page_t) mem->pageq.next; mem->pageq.next = NULL; mem->pageq.prev = NULL; mem->free = FALSE; - mem->no_isync = TRUE; vm_lopage_free_count--; vm_lopage_alloc_count = (vm_lopage_poolend - vm_lopage_poolstart) - vm_lopage_free_count; if (vm_lopage_alloc_count > vm_lopage_max_count) vm_lopage_max_count = vm_lopage_alloc_count; + } else { + mem = VM_PAGE_NULL; } mutex_unlock(&vm_page_queue_free_lock); @@ -1468,23 +1564,62 @@ vm_page_grablo(void) } - /* * vm_page_grab: * - * Remove a page from the free list. - * Returns VM_PAGE_NULL if the free list is too small. + * first try to grab a page from the per-cpu free list... + * this must be done while pre-emption is disabled... if + * a page is available, we're done... + * if no page is available, grab the vm_page_queue_free_lock + * and see if current number of free pages would allow us + * to grab at least 1... if not, return VM_PAGE_NULL as before... + * if there are pages available, disable preemption and + * recheck the state of the per-cpu free list... we could + * have been preempted and moved to a different cpu, or + * some other thread could have re-filled it... if still + * empty, figure out how many pages we can steal from the + * global free queue and move to the per-cpu queue... + * return 1 of these pages when done... only wakeup the + * pageout_scan thread if we moved pages from the global + * list... no need for the wakeup if we've satisfied the + * request from the per-cpu queue. */ -unsigned long vm_page_grab_count = 0; /* measure demand */ +#define COLOR_GROUPS_TO_STEAL 4 + vm_page_t -vm_page_grab(void) +vm_page_grab( void ) { - register vm_page_t mem; + vm_page_t mem; + + + disable_preemption(); + + if ((mem = PROCESSOR_DATA(current_processor(), free_pages))) { +return_page_from_cpu_list: + PROCESSOR_DATA(current_processor(), page_grab_count) += 1; + PROCESSOR_DATA(current_processor(), free_pages) = mem->pageq.next; + mem->pageq.next = NULL; + + enable_preemption(); + + assert(mem->listq.next == NULL && mem->listq.prev == NULL); + assert(mem->tabled == FALSE); + assert(mem->object == VM_OBJECT_NULL); + assert(!mem->laundry); + assert(!mem->free); + assert(pmap_verify_free(mem->phys_page)); + assert(mem->busy); + assert(!mem->encrypted); + assert(!mem->pmapped); + + return mem; + } + enable_preemption(); + mutex_lock(&vm_page_queue_free_lock); - vm_page_grab_count++; /* * Optionally produce warnings if the wire or gobble @@ -1507,36 +1642,102 @@ vm_page_grab(void) * Only let privileged threads (involved in pageout) * dip into the reserved pool. 
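The comment above describes the new vm_page_grab() fast path: satisfy the request from a per-processor list of pre-grabbed pages, and only fall back to the global free queue (and its lock) to refill that list in batches. A single-threaded sketch of the same shape, with preemption control and locking elided and all names hypothetical:

#include <stdio.h>

#define BATCH 4                 /* stands in for COLOR_GROUPS_TO_STEAL * vm_colors */

static int global_free[64];     /* pretend global free queue of page numbers */
static int global_count = 64;

struct cpu_cache { int pages[BATCH]; int count; };

/* refill the per-cpu cache from the global queue; in the kernel this happens
 * under vm_page_queue_free_lock with preemption disabled */
static void refill(struct cpu_cache *c)
{
    while (c->count < BATCH && global_count > 0)
        c->pages[c->count++] = global_free[--global_count];
}

static int page_grab(struct cpu_cache *c)
{
    if (c->count == 0)           /* fast path missed: go to the global queue */
        refill(c);
    if (c->count == 0)
        return -1;               /* nothing available (VM_PAGE_NULL analogue) */
    return c->pages[--c->count]; /* common case: no global lock needed */
}

int main(void)
{
    struct cpu_cache c = { .count = 0 };

    for (int i = 0; i < 64; i++)
        global_free[i] = i;

    for (int i = 0; i < 6; i++) {
        int page = page_grab(&c);
        printf("grabbed page %d (cache now holds %d)\n", page, c.count);
    }
    return 0;
}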
*/ - if ((vm_page_free_count < vm_page_free_reserved) && !(current_thread()->options & TH_OPT_VMPRIV)) { mutex_unlock(&vm_page_queue_free_lock); mem = VM_PAGE_NULL; - goto wakeup_pageout; } + else { + vm_page_t head; + vm_page_t tail; + unsigned int pages_to_steal; + unsigned int color; - while (vm_page_queue_free == VM_PAGE_NULL) { - mutex_unlock(&vm_page_queue_free_lock); - VM_PAGE_WAIT(); - mutex_lock(&vm_page_queue_free_lock); - } - - if (--vm_page_free_count < vm_page_free_count_minimum) - vm_page_free_count_minimum = vm_page_free_count; - mem = vm_page_queue_free; - vm_page_queue_free = (vm_page_t) mem->pageq.next; - mem->pageq.next = NULL; - mem->pageq.prev = NULL; - assert(mem->listq.next == NULL && mem->listq.prev == NULL); - assert(mem->tabled == FALSE); - assert(mem->object == VM_OBJECT_NULL); - assert(!mem->laundry); - mem->free = FALSE; - mem->no_isync = TRUE; - mutex_unlock(&vm_page_queue_free_lock); + while ( vm_page_free_count == 0 ) { + + mutex_unlock(&vm_page_queue_free_lock); + /* + * must be a privileged thread to be + * in this state since a non-privileged + * thread would have bailed if we were + * under the vm_page_free_reserved mark + */ + VM_PAGE_WAIT(); + mutex_lock(&vm_page_queue_free_lock); + } + + disable_preemption(); + + if ((mem = PROCESSOR_DATA(current_processor(), free_pages))) { + mutex_unlock(&vm_page_queue_free_lock); + + /* + * we got preempted and moved to another processor + * or we got preempted and someone else ran and filled the cache + */ + goto return_page_from_cpu_list; + } + if (vm_page_free_count <= vm_page_free_reserved) + pages_to_steal = 1; + else { + pages_to_steal = COLOR_GROUPS_TO_STEAL * vm_colors; + + if (pages_to_steal > (vm_page_free_count - vm_page_free_reserved)) + pages_to_steal = (vm_page_free_count - vm_page_free_reserved); + } + color = PROCESSOR_DATA(current_processor(), start_color); + head = tail = NULL; + + while (pages_to_steal--) { + if (--vm_page_free_count < vm_page_free_count_minimum) + vm_page_free_count_minimum = vm_page_free_count; + + while (queue_empty(&vm_page_queue_free[color])) + color = (color + 1) & vm_color_mask; + + queue_remove_first(&vm_page_queue_free[color], + mem, + vm_page_t, + pageq); + mem->pageq.next = NULL; + mem->pageq.prev = NULL; + + color = (color + 1) & vm_color_mask; + + if (head == NULL) + head = mem; + else + tail->pageq.next = (queue_t)mem; + tail = mem; + + mem->pageq.prev = NULL; + assert(mem->listq.next == NULL && mem->listq.prev == NULL); + assert(mem->tabled == FALSE); + assert(mem->object == VM_OBJECT_NULL); + assert(!mem->laundry); + assert(mem->free); + mem->free = FALSE; + + assert(pmap_verify_free(mem->phys_page)); + assert(mem->busy); + assert(!mem->free); + assert(!mem->encrypted); + assert(!mem->pmapped); + } + PROCESSOR_DATA(current_processor(), free_pages) = head->pageq.next; + PROCESSOR_DATA(current_processor(), start_color) = color; + + /* + * satisfy this request + */ + PROCESSOR_DATA(current_processor(), page_grab_count) += 1; + mem = head; + mem->pageq.next = NULL; - assert(pmap_verify_free(mem->phys_page)); + mutex_unlock(&vm_page_queue_free_lock); + enable_preemption(); + } /* * Decide if we should poke the pageout daemon. * We do this if the free count is less than the low @@ -1547,12 +1748,29 @@ vm_page_grab(void) * We don't have the counts locked ... if they change a little, * it doesn't really matter. 
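The refill loop above walks the per-color free queues round-robin, starting from the processor's saved start_color, skipping empty colors and remembering where it stopped so the next refill continues from there. A compact sketch of that rotation; plain counters stand in for the real queues, and the kernel only enters the loop once it knows enough free pages exist:

#include <stdio.h>

#define COLORS 4                               /* assumed power-of-two color count */

int main(void)
{
    int free_count[COLORS] = { 3, 0, 2, 1 };   /* pages left on each color queue */
    unsigned int color = 2;                    /* per-processor start_color */
    int pages_to_steal = 5;

    while (pages_to_steal--) {
        while (free_count[color] == 0)         /* skip empty color queues */
            color = (color + 1) & (COLORS - 1);

        free_count[color]--;                   /* queue_remove_first() analogue */
        printf("stole a page of color %u\n", color);

        color = (color + 1) & (COLORS - 1);    /* rotate to spread colors evenly */
    }
    printf("next refill starts at color %u\n", color);   /* saved as start_color */
    return 0;
}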
*/ - -wakeup_pageout: if ((vm_page_free_count < vm_page_free_min) || ((vm_page_free_count < vm_page_free_target) && - (vm_page_inactive_count < vm_page_inactive_target))) - thread_wakeup((event_t) &vm_page_free_wanted); + ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) + thread_wakeup((event_t) &vm_page_free_wanted); + +#if CONFIG_EMBEDDED + { + int percent_avail; + + /* + * Decide if we need to poke the memorystatus notification thread. + */ + percent_avail = + (vm_page_active_count + vm_page_inactive_count + + vm_page_speculative_count + vm_page_free_count + + vm_page_purgeable_count ) * 100 / + atop_64(max_mem); + if (percent_avail <= (kern_memorystatus_level - 5)) { + kern_memorystatus_level = percent_avail; + thread_wakeup((event_t)&kern_memorystatus_wakeup); + } + } +#endif // dbgLog(mem->phys_page, vm_page_free_count, vm_page_wire_count, 4); /* (TEST/DEBUG) */ @@ -1569,7 +1787,7 @@ void vm_page_release( register vm_page_t mem) { - + unsigned int color; #if 0 unsigned int pindex; phys_entry *physent; @@ -1590,23 +1808,32 @@ vm_page_release( panic("vm_page_release"); #endif mem->free = TRUE; + + assert(mem->busy); assert(!mem->laundry); assert(mem->object == VM_OBJECT_NULL); assert(mem->pageq.next == NULL && mem->pageq.prev == NULL); - + assert(mem->listq.next == NULL && + mem->listq.prev == NULL); + if (mem->phys_page <= vm_lopage_poolend && mem->phys_page >= vm_lopage_poolstart) { /* * this exists to support hardware controllers * incapable of generating DMAs with more than 32 bits * of address on platforms with physical memory > 4G... */ - mem->pageq.next = (queue_entry_t) vm_lopage_queue_free; - vm_lopage_queue_free = mem; + queue_enter_first(&vm_lopage_queue_free, + mem, + vm_page_t, + pageq); vm_lopage_free_count++; } else { - mem->pageq.next = (queue_entry_t) vm_page_queue_free; - vm_page_queue_free = mem; + color = mem->phys_page & vm_color_mask; + queue_enter_first(&vm_page_queue_free[color], + mem, + vm_page_t, + pageq); vm_page_free_count++; /* * Check if we should wake up someone waiting for page. @@ -1628,13 +1855,36 @@ vm_page_release( * as long as vm_page_free_wanted is non-zero. */ - if ((vm_page_free_wanted > 0) && - (vm_page_free_count >= vm_page_free_reserved)) { + if ((vm_page_free_wanted_privileged > 0) && vm_page_free_count) { + vm_page_free_wanted_privileged--; + thread_wakeup_one((event_t) &vm_page_free_wanted_privileged); + } else if ((vm_page_free_wanted > 0) && + (vm_page_free_count >= vm_page_free_reserved)) { vm_page_free_wanted--; thread_wakeup_one((event_t) &vm_page_free_count); } } mutex_unlock(&vm_page_queue_free_lock); + +#if CONFIG_EMBEDDED + { + int percent_avail; + + /* + * Decide if we need to poke the memorystatus notification thread. + * Locking is not a big issue, as only a single thread delivers these. 
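On CONFIG_EMBEDDED configurations the grab and release paths above also recompute the fraction of available memory (active + inactive + speculative + free + purgeable pages over all physical pages) and only wake the memorystatus thread once that figure has moved at least 5 points past the last level it was told about; the grab path checks the downward direction and the release path the upward one. The sketch below folds both directions into one helper and uses made-up page counts:

#include <stdio.h>

static int memorystatus_level = 50;      /* last level reported to the thread */

/* returns 1 if the notification thread should be woken (illustrative only) */
static int maybe_notify(long avail_pages, long total_pages)
{
    int percent_avail = (int)(avail_pages * 100 / total_pages);

    /* only react once we've drifted 5 points past the last reported level */
    if (percent_avail <= memorystatus_level - 5 ||
        percent_avail >= memorystatus_level + 5) {
        memorystatus_level = percent_avail;
        return 1;
    }
    return 0;
}

int main(void)
{
    long total = 100000;                  /* pretend machine size in pages */
    long samples[] = { 48000, 47000, 44000, 52000 };

    for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; i++) {
        int wake = maybe_notify(samples[i], total);
        printf("avail %ld pages -> wakeup %d (level now %d)\n",
               samples[i], wake, memorystatus_level);
    }
    return 0;
}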
+ */ + percent_avail = + (vm_page_active_count + vm_page_inactive_count + + vm_page_speculative_count + vm_page_free_count + + vm_page_purgeable_count ) * 100 / + atop_64(max_mem); + if (percent_avail >= (kern_memorystatus_level + 5)) { + kern_memorystatus_level = percent_avail; + thread_wakeup((event_t)&kern_memorystatus_wakeup); + } + } +#endif } /* @@ -1661,12 +1911,25 @@ vm_page_wait( */ kern_return_t wait_result; int need_wakeup = 0; + int is_privileged = current_thread()->options & TH_OPT_VMPRIV; mutex_lock(&vm_page_queue_free_lock); + + if (is_privileged && vm_page_free_count) { + mutex_unlock(&vm_page_queue_free_lock); + return TRUE; + } if (vm_page_free_count < vm_page_free_target) { - if (vm_page_free_wanted++ == 0) - need_wakeup = 1; - wait_result = assert_wait((event_t)&vm_page_free_count, interruptible); + + if (is_privileged) { + if (vm_page_free_wanted_privileged++ == 0) + need_wakeup = 1; + wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, interruptible); + } else { + if (vm_page_free_wanted++ == 0) + need_wakeup = 1; + wait_result = assert_wait((event_t)&vm_page_free_count, interruptible); + } mutex_unlock(&vm_page_queue_free_lock); counter(c_vm_page_wait_block++); @@ -1699,9 +1962,7 @@ vm_page_alloc( { register vm_page_t mem; -#if DEBUG - _mutex_assert(&object->Lock, MA_OWNED); -#endif + vm_object_lock_assert_exclusive(object); mem = vm_page_grab(); if (mem == VM_PAGE_NULL) return VM_PAGE_NULL; @@ -1711,7 +1972,6 @@ vm_page_alloc( return(mem); } - vm_page_t vm_page_alloclo( vm_object_t object, @@ -1719,9 +1979,7 @@ vm_page_alloclo( { register vm_page_t mem; -#if DEBUG - _mutex_assert(&object->Lock, MA_OWNED); -#endif + vm_object_lock_assert_exclusive(object); mem = vm_page_grablo(); if (mem == VM_PAGE_NULL) return VM_PAGE_NULL; @@ -1732,9 +1990,34 @@ vm_page_alloclo( } +/* + * vm_page_alloc_guard: + * + * Allocate a ficticious page which will be used + * as a guard page. The page will be inserted into + * the object and returned to the caller. + */ + +vm_page_t +vm_page_alloc_guard( + vm_object_t object, + vm_object_offset_t offset) +{ + register vm_page_t mem; + + vm_object_lock_assert_exclusive(object); + mem = vm_page_grab_guard(); + if (mem == VM_PAGE_NULL) + return VM_PAGE_NULL; + + vm_page_insert(mem, object, offset); + + return(mem); +} + + counter(unsigned int c_laundry_pages_freed = 0;) -int vm_pagein_cluster_unused = 0; boolean_t vm_page_free_verify = TRUE; /* * vm_page_free: @@ -1745,34 +2028,42 @@ boolean_t vm_page_free_verify = TRUE; * Object and page queues must be locked prior to entry. */ void -vm_page_free( +vm_page_free_prepare( register vm_page_t mem) { - vm_object_t object = mem->object; - + VM_PAGE_CHECK(mem); assert(!mem->free); assert(!mem->cleaning); assert(!mem->pageout); + +#if DEBUG if (vm_page_free_verify && !mem->fictitious && !mem->private) { assert(pmap_verify_free(mem->phys_page)); } - -#if DEBUG if (mem->object) - _mutex_assert(&mem->object->Lock, MA_OWNED); + vm_object_lock_assert_exclusive(mem->object); _mutex_assert(&vm_page_queue_lock, MA_OWNED); if (mem->free) panic("vm_page_free: freeing page on free list\n"); #endif + + if (mem->laundry) { + /* + * We may have to free a page while it's being laundered + * if we lost its pager (due to a forced unmount, for example). + * We need to call vm_pageout_throttle_up() before removing + * the page from its VM object, so that we can find out on + * which pageout queue the page is. 
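vm_page_wait() above now distinguishes privileged waiters (pageout-related threads carrying TH_OPT_VMPRIV) from ordinary ones: a privileged thread returns at once if any page is free, and otherwise sleeps on its own counter so vm_page_release() can wake it ahead of the general waiters. A sketch that mirrors just that decision structure, with the sleep, locking, and event constants replaced by placeholders:

#include <stdio.h>
#include <stdbool.h>

enum wait_event { WAIT_NONE, WAIT_PRIVILEGED, WAIT_FREE_COUNT };

/* what should this thread block on, if anything? (illustrative only) */
static enum wait_event page_wait_decision(bool privileged,
                                          unsigned free_count,
                                          unsigned free_target)
{
    if (privileged && free_count > 0)
        return WAIT_NONE;                 /* privileged threads may dip in now */

    if (free_count < free_target)
        return privileged ? WAIT_PRIVILEGED : WAIT_FREE_COUNT;

    return WAIT_NONE;                     /* enough memory, no need to block */
}

int main(void)
{
    printf("%d\n", page_wait_decision(true,  3, 100));   /* WAIT_NONE */
    printf("%d\n", page_wait_decision(true,  0, 100));   /* WAIT_PRIVILEGED */
    printf("%d\n", page_wait_decision(false, 3, 100));   /* WAIT_FREE_COUNT */
    return 0;
}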
+ */ + vm_pageout_throttle_up(mem); + counter(++c_laundry_pages_freed); + } + if (mem->tabled) vm_page_remove(mem); /* clears tabled, object, offset */ - VM_PAGE_QUEUES_REMOVE(mem); /* clears active or inactive */ - if (mem->clustered) { - mem->clustered = FALSE; - vm_pagein_cluster_unused++; - } + VM_PAGE_QUEUES_REMOVE(mem); /* clears active/inactive/throttled/speculative */ if (mem->wire_count) { if (!mem->private && !mem->fictitious) @@ -1786,19 +2077,9 @@ vm_page_free( } mem->gobbled = FALSE; - if (mem->laundry) { - vm_pageout_throttle_up(mem); - counter(++c_laundry_pages_freed); - } - PAGE_WAKEUP(mem); /* clears wanted */ - if (mem->absent) - vm_object_absent_release(object); - /* Some of these may be unnecessary */ - mem->page_lock = 0; - mem->unlock_request = 0; mem->busy = TRUE; mem->absent = FALSE; mem->error = FALSE; @@ -1806,37 +2087,61 @@ vm_page_free( mem->precious = FALSE; mem->reference = FALSE; mem->encrypted = FALSE; - - mem->page_error = KERN_SUCCESS; + mem->encrypted_cleaning = FALSE; + mem->deactivated = FALSE; + mem->pmapped = FALSE; if (mem->private) { mem->private = FALSE; mem->fictitious = TRUE; mem->phys_page = vm_page_fictitious_addr; } - if (mem->fictitious) { - vm_page_release_fictitious(mem); - } else { - /* depends on the queues lock */ - if(mem->zero_fill) { - vm_zf_count-=1; + if (!mem->fictitious) { + if (mem->zero_fill == TRUE) { mem->zero_fill = FALSE; + OSAddAtomic(-1, (SInt32 *)&vm_zf_count); } vm_page_init(mem, mem->phys_page); - vm_page_release(mem); } } +void +vm_page_free( + vm_page_t mem) +{ + vm_page_free_prepare(mem); + if (mem->fictitious) { + vm_page_release_fictitious(mem); + } else { + vm_page_release(mem); + } +} +/* + * Free a list of pages. The list can be up to several hundred pages, + * as blocked up by vm_pageout_scan(). + * The big win is not having to take the page q and free list locks once + * per page. We sort the incoming pages into n lists, one for + * each color. + * + * The page queues must be locked, and are kept locked. 
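vm_page_free_list() below batches an incoming chain of pages into one bucket per color and keeps a small "which colors did I actually touch" chain (inuse_list_head) so the later splice into the global per-color queues only visits non-empty buckets. A sketch of that bucketing pass, with plain counters standing in for the real page queues:

#include <stdio.h>

#define COLORS 4                               /* assumed color count */

int main(void)
{
    int pages[] = { 7, 12, 3, 8, 19, 4 };      /* physical page numbers to free */
    int bucket_count[COLORS] = { 0 };
    int inuse_next[COLORS];                    /* chain of touched colors */
    int inuse_head = -1;

    for (unsigned i = 0; i < sizeof pages / sizeof pages[0]; i++) {
        int color = pages[i] & (COLORS - 1);

        if (bucket_count[color]++ == 0) {      /* first page of this color */
            inuse_next[color] = inuse_head;
            inuse_head = color;
        }
    }

    /* second pass: only touched colors need their global queue spliced */
    for (int color = inuse_head; color != -1; color = inuse_next[color])
        printf("splice %d page(s) onto free queue color %d\n",
               bucket_count[color], color);
    return 0;
}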
+ */ void vm_page_free_list( - register vm_page_t mem) + vm_page_t mem) { - register vm_page_t nxt; - register vm_page_t first = NULL; - register vm_page_t last = VM_PAGE_NULL; - register int pg_count = 0; + vm_page_t nxt; + int pg_count = 0; + int color; + int inuse_list_head = -1; + + queue_head_t free_list[MAX_COLORS]; + int inuse[MAX_COLORS]; + for (color = 0; color < (signed) vm_colors; color++) { + queue_init(&free_list[color]); + } + #if DEBUG _mutex_assert(&vm_page_queue_lock, MA_OWNED); #endif @@ -1844,63 +2149,84 @@ vm_page_free_list( #if DEBUG if (mem->tabled || mem->object) panic("vm_page_free_list: freeing tabled page\n"); - if (mem->inactive || mem->active || mem->free) + if (mem->inactive || mem->active || mem->throttled || mem->free) panic("vm_page_free_list: freeing page on list\n"); + if (vm_page_free_verify && !mem->fictitious && !mem->private) { + assert(pmap_verify_free(mem->phys_page)); + } #endif assert(mem->pageq.prev == NULL); + assert(mem->busy); + assert(!mem->free); nxt = (vm_page_t)(mem->pageq.next); - if (mem->clustered) - vm_pagein_cluster_unused++; - - if (mem->laundry) { - vm_pageout_throttle_up(mem); - counter(++c_laundry_pages_freed); - } - mem->busy = TRUE; - - PAGE_WAKEUP(mem); /* clears wanted */ - - if (mem->private) - mem->fictitious = TRUE; - if (!mem->fictitious) { - /* depends on the queues lock */ - if (mem->zero_fill) - vm_zf_count -= 1; - assert(!mem->laundry); - vm_page_init(mem, mem->phys_page); - mem->free = TRUE; - if (first == NULL) - last = mem; - mem->pageq.next = (queue_t) first; - first = mem; - + color = mem->phys_page & vm_color_mask; + if (queue_empty(&free_list[color])) { + inuse[color] = inuse_list_head; + inuse_list_head = color; + } + queue_enter_first(&free_list[color], + mem, + vm_page_t, + pageq); pg_count++; } else { - mem->phys_page = vm_page_fictitious_addr; + assert(mem->phys_page == vm_page_fictitious_addr || + mem->phys_page == vm_page_guard_addr); vm_page_release_fictitious(mem); } mem = nxt; } - if (first) { - + if (pg_count) { + unsigned int avail_free_count; + mutex_lock(&vm_page_queue_free_lock); - last->pageq.next = (queue_entry_t) vm_page_queue_free; - vm_page_queue_free = first; + color = inuse_list_head; + + while( color != -1 ) { + vm_page_t first, last; + vm_page_t first_free; + + first = (vm_page_t) queue_first(&free_list[color]); + last = (vm_page_t) queue_last(&free_list[color]); + first_free = (vm_page_t) queue_first(&vm_page_queue_free[color]); + if (queue_empty(&vm_page_queue_free[color])) { + queue_last(&vm_page_queue_free[color]) = + (queue_entry_t) last; + } else { + queue_prev(&first_free->pageq) = + (queue_entry_t) last; + } + queue_first(&vm_page_queue_free[color]) = + (queue_entry_t) first; + queue_prev(&first->pageq) = + (queue_entry_t) &vm_page_queue_free[color]; + queue_next(&last->pageq) = + (queue_entry_t) first_free; + color = inuse[color]; + } + vm_page_free_count += pg_count; + avail_free_count = vm_page_free_count; + + while ((vm_page_free_wanted_privileged > 0) && avail_free_count) { + vm_page_free_wanted_privileged--; + avail_free_count--; + + thread_wakeup_one((event_t) &vm_page_free_wanted_privileged); + } if ((vm_page_free_wanted > 0) && - (vm_page_free_count >= vm_page_free_reserved)) { + (avail_free_count >= vm_page_free_reserved)) { unsigned int available_pages; - if (vm_page_free_count >= vm_page_free_reserved) { - available_pages = (vm_page_free_count - - vm_page_free_reserved); + if (avail_free_count >= vm_page_free_reserved) { + available_pages = (avail_free_count - 
vm_page_free_reserved); } else { available_pages = 0; } @@ -1916,6 +2242,25 @@ vm_page_free_list( } } mutex_unlock(&vm_page_queue_free_lock); + +#if CONFIG_EMBEDDED + { + int percent_avail; + + /* + * Decide if we need to poke the memorystatus notification thread. + */ + percent_avail = + (vm_page_active_count + vm_page_inactive_count + + vm_page_speculative_count + vm_page_free_count + + vm_page_purgeable_count ) * 100 / + atop_64(max_mem); + if (percent_avail >= (kern_memorystatus_level + 5)) { + kern_memorystatus_level = percent_avail; + thread_wakeup((event_t)&kern_memorystatus_wakeup); + } + } +#endif } } @@ -1939,7 +2284,7 @@ vm_page_wire( VM_PAGE_CHECK(mem); #if DEBUG if (mem->object) - _mutex_assert(&mem->object->Lock, MA_OWNED); + vm_object_lock_assert_exclusive(mem->object); _mutex_assert(&vm_page_queue_lock, MA_OWNED); #endif if (mem->wire_count == 0) { @@ -1949,10 +2294,9 @@ vm_page_wire( if (mem->gobbled) vm_page_gobble_count--; mem->gobbled = FALSE; - if(mem->zero_fill) { - /* depends on the queues lock */ - vm_zf_count-=1; + if (mem->zero_fill == TRUE) { mem->zero_fill = FALSE; + OSAddAtomic(-1, (SInt32 *)&vm_zf_count); } /* * ENCRYPTED SWAP: @@ -1979,7 +2323,7 @@ void vm_page_gobble( register vm_page_t mem) { - vm_page_lock_queues(); + vm_page_lockspin_queues(); VM_PAGE_CHECK(mem); assert(!mem->gobbled); @@ -2013,7 +2357,7 @@ vm_page_unwire( assert(mem->wire_count > 0); #if DEBUG if (mem->object) - _mutex_assert(&mem->object->Lock, MA_OWNED); + vm_object_lock_assert_exclusive(mem->object); _mutex_assert(&vm_page_queue_lock, MA_OWNED); #endif if (--mem->wire_count == 0) { @@ -2022,13 +2366,23 @@ vm_page_unwire( assert(!mem->laundry); assert(mem->object != kernel_object); assert(mem->pageq.next == NULL && mem->pageq.prev == NULL); - queue_enter(&vm_page_queue_active, mem, vm_page_t, pageq); - vm_page_active_count++; - mem->active = TRUE; + if (!IP_VALID(memory_manager_default) && + mem->dirty && mem->object->internal && + (mem->object->purgable == VM_PURGABLE_DENY || + mem->object->purgable == VM_PURGABLE_NONVOLATILE)) { + queue_enter(&vm_page_queue_throttled, mem, vm_page_t, pageq); + vm_page_throttled_count++; + mem->throttled = TRUE; + } else { + queue_enter(&vm_page_queue_active, mem, vm_page_t, pageq); + vm_page_active_count++; + mem->active = TRUE; + } mem->reference = TRUE; } } + /* * vm_page_deactivate: * @@ -2042,8 +2396,11 @@ void vm_page_deactivate( register vm_page_t m) { + boolean_t rapid_age = FALSE; + VM_PAGE_CHECK(m); assert(m->object != kernel_object); + assert(m->phys_page != vm_page_guard_addr); // dbgLog(m->phys_page, vm_page_free_count, vm_page_wire_count, 6); /* (TEST/DEBUG) */ #if DEBUG @@ -2058,6 +2415,7 @@ vm_page_deactivate( */ if (m->gobbled) { /* can this happen? 
*/ assert(m->wire_count == 0); + if (!m->private && !m->fictitious) vm_page_wire_count--; vm_page_gobble_count--; @@ -2065,36 +2423,51 @@ vm_page_deactivate( } if (m->private || (m->wire_count != 0)) return; - if (m->active || (m->inactive && m->reference)) { - if (!m->fictitious && !m->absent) - pmap_clear_reference(m->phys_page); - m->reference = FALSE; - VM_PAGE_QUEUES_REMOVE(m); + + if (m->active && m->deactivated == TRUE) { + if (!pmap_is_referenced(m->phys_page)) + rapid_age = TRUE; } - if (m->wire_count == 0 && !m->inactive) { - m->page_ticket = vm_page_ticket; - vm_page_ticket_roll++; + if (rapid_age == FALSE && !m->fictitious && !m->absent) + pmap_clear_reference(m->phys_page); + + m->reference = FALSE; + m->deactivated = FALSE; + m->no_cache = FALSE; + + if (!m->inactive) { + VM_PAGE_QUEUES_REMOVE(m); - if(vm_page_ticket_roll == VM_PAGE_TICKETS_IN_ROLL) { - vm_page_ticket_roll = 0; - if(vm_page_ticket == VM_PAGE_TICKET_ROLL_IDS) - vm_page_ticket= 0; - else - vm_page_ticket++; - } - assert(!m->laundry); assert(m->pageq.next == NULL && m->pageq.prev == NULL); - if(m->zero_fill) { - queue_enter(&vm_page_queue_zf, m, vm_page_t, pageq); + + if (!IP_VALID(memory_manager_default) && + m->dirty && m->object->internal && + (m->object->purgable == VM_PURGABLE_DENY || + m->object->purgable == VM_PURGABLE_NONVOLATILE)) { + queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); + m->throttled = TRUE; + vm_page_throttled_count++; } else { - queue_enter(&vm_page_queue_inactive, - m, vm_page_t, pageq); + if (rapid_age == TRUE || + (!m->fictitious && m->object->named && m->object->ref_count == 1)) { + vm_page_speculate(m, FALSE); + vm_page_speculative_recreated++; + return; + } else { + if (m->zero_fill) { + queue_enter(&vm_page_queue_zf, m, vm_page_t, pageq); + vm_zf_queue_count++; + } else { + queue_enter(&vm_page_queue_inactive, m, vm_page_t, pageq); + } + } + m->inactive = TRUE; + if (!m->fictitious) { + vm_page_inactive_count++; + token_new_pagecount++; + } } - - m->inactive = TRUE; - if (!m->fictitious) - vm_page_inactive_count++; } } @@ -2111,7 +2484,10 @@ vm_page_activate( register vm_page_t m) { VM_PAGE_CHECK(m); +#ifdef FIXME_4778297 assert(m->object != kernel_object); +#endif + assert(m->phys_page != vm_page_guard_addr); #if DEBUG _mutex_assert(&vm_page_queue_lock, MA_OWNED); #endif @@ -2125,35 +2501,190 @@ vm_page_activate( if (m->private) return; - if (m->inactive) { +#if DEBUG + if (m->active) + panic("vm_page_activate: already active"); +#endif + + if (m->speculative) { + DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL); + DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL); + } + + VM_PAGE_QUEUES_REMOVE(m); + + if (m->wire_count == 0) { assert(!m->laundry); - if (m->zero_fill) { - queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq); + assert(m->pageq.next == NULL && m->pageq.prev == NULL); + if (!IP_VALID(memory_manager_default) && + !m->fictitious && m->dirty && m->object->internal && + (m->object->purgable == VM_PURGABLE_DENY || + m->object->purgable == VM_PURGABLE_NONVOLATILE)) { + queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); + m->throttled = TRUE; + vm_page_throttled_count++; } else { - queue_remove(&vm_page_queue_inactive, - m, vm_page_t, pageq); + queue_enter(&vm_page_queue_active, m, vm_page_t, pageq); + m->active = TRUE; + if (!m->fictitious) + vm_page_active_count++; } - m->pageq.next = NULL; - m->pageq.prev = NULL; - if (!m->fictitious) - vm_page_inactive_count--; - m->inactive = FALSE; + m->reference = TRUE; + m->no_cache = FALSE; } - if (m->wire_count == 
0) { +} + + +/* + * vm_page_speculate: + * + * Put the specified page on the speculative list (if appropriate). + * + * The page queues must be locked. + */ +void +vm_page_speculate( + vm_page_t m, + boolean_t new) +{ + struct vm_speculative_age_q *aq; + + VM_PAGE_CHECK(m); + assert(m->object != kernel_object); + assert(!m->speculative && !m->active && !m->inactive && !m->throttled); + assert(m->phys_page != vm_page_guard_addr); + assert(m->pageq.next == NULL && m->pageq.prev == NULL); #if DEBUG - if (m->active) - panic("vm_page_activate: already active"); + _mutex_assert(&vm_page_queue_lock, MA_OWNED); #endif - assert(!m->laundry); - assert(m->pageq.next == NULL && m->pageq.prev == NULL); - queue_enter(&vm_page_queue_active, m, vm_page_t, pageq); - m->active = TRUE; - m->reference = TRUE; - if (!m->fictitious) - vm_page_active_count++; + if (m->wire_count == 0) { + mach_timespec_t ts; + + clock_get_system_nanotime(&ts.tv_sec, (unsigned *)&ts.tv_nsec); + + if (vm_page_speculative_count == 0) { + + speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q; + speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q; + + aq = &vm_page_queue_speculative[speculative_age_index]; + + /* + * set the timer to begin a new group + */ + aq->age_ts.tv_sec = VM_PAGE_SPECULATIVE_Q_AGE_MS / 1000; + aq->age_ts.tv_nsec = (VM_PAGE_SPECULATIVE_Q_AGE_MS % 1000) * 1000 * NSEC_PER_USEC; + + ADD_MACH_TIMESPEC(&aq->age_ts, &ts); + } else { + aq = &vm_page_queue_speculative[speculative_age_index]; + + if (CMP_MACH_TIMESPEC(&ts, &aq->age_ts) >= 0) { + + speculative_age_index++; + + if (speculative_age_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) + speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q; + if (speculative_age_index == speculative_steal_index) { + speculative_steal_index = speculative_age_index + 1; + + if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) + speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q; + } + aq = &vm_page_queue_speculative[speculative_age_index]; + + if (!queue_empty(&aq->age_q)) + vm_page_speculate_ageit(aq); + + aq->age_ts.tv_sec = VM_PAGE_SPECULATIVE_Q_AGE_MS / 1000; + aq->age_ts.tv_nsec = (VM_PAGE_SPECULATIVE_Q_AGE_MS % 1000) * 1000 * NSEC_PER_USEC; + + ADD_MACH_TIMESPEC(&aq->age_ts, &ts); + } + } + enqueue_tail(&aq->age_q, &m->pageq); + m->speculative = TRUE; + vm_page_speculative_count++; + + if (new == TRUE) { + m->object->pages_created++; + vm_page_speculative_created++; + } + } +} + + +/* + * move pages from the specified aging bin to + * the speculative bin that pageout_scan claims from + * + * The page queues must be locked. 
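vm_page_speculate() above spreads speculatively created pages across a small ring of age bins: each bin carries a deadline, and once the clock passes it the insertion index advances (wrapping within the MIN..MAX range), bumping the steal index out of the way if the two would collide. The sketch below shows only that index arithmetic, with small made-up constants and the queue merging (vm_page_speculate_ageit) omitted:

#include <stdio.h>

#define MIN_Q  0
#define MAX_Q  3                      /* pretend 4 aging bins */
#define AGE_MS 500                    /* stands in for VM_PAGE_SPECULATIVE_Q_AGE_MS */

static int  age_index = MIN_Q;
static int  steal_index = MIN_Q;
static long bin_deadline_ms = AGE_MS; /* deadline of the bin we're filling */

/* pick the bin for a page arriving at time now_ms, advancing the ring if
 * the current bin's deadline has passed (illustrative only) */
static int speculate_bin(long now_ms)
{
    if (now_ms >= bin_deadline_ms) {
        if (++age_index > MAX_Q)
            age_index = MIN_Q;                    /* wrap around the ring */
        if (age_index == steal_index &&           /* don't collide with the bin */
            ++steal_index > MAX_Q)                /* that pageout steals from */
            steal_index = MIN_Q;
        bin_deadline_ms = now_ms + AGE_MS;        /* restamp the new bin */
    }
    return age_index;
}

int main(void)
{
    long times[] = { 100, 400, 600, 1200, 1300, 1900 };

    for (unsigned i = 0; i < sizeof times / sizeof times[0]; i++) {
        int bin = speculate_bin(times[i]);
        printf("t=%ldms -> bin %d (steal bin %d)\n", times[i], bin, steal_index);
    }
    return 0;
}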
+ */ +void +vm_page_speculate_ageit(struct vm_speculative_age_q *aq) +{ + struct vm_speculative_age_q *sq; + vm_page_t t; + + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; + + if (queue_empty(&sq->age_q)) { + sq->age_q.next = aq->age_q.next; + sq->age_q.prev = aq->age_q.prev; + + t = (vm_page_t)sq->age_q.next; + t->pageq.prev = &sq->age_q; + + t = (vm_page_t)sq->age_q.prev; + t->pageq.next = &sq->age_q; + } else { + t = (vm_page_t)sq->age_q.prev; + t->pageq.next = aq->age_q.next; + + t = (vm_page_t)aq->age_q.next; + t->pageq.prev = sq->age_q.prev; + + t = (vm_page_t)aq->age_q.prev; + t->pageq.next = &sq->age_q; + + sq->age_q.prev = aq->age_q.prev; } + queue_init(&aq->age_q); +} + + +void +vm_page_lru( + vm_page_t m) +{ + VM_PAGE_CHECK(m); + assert(m->object != kernel_object); + assert(m->phys_page != vm_page_guard_addr); + +#if DEBUG + _mutex_assert(&vm_page_queue_lock, MA_OWNED); +#endif + if (m->active || m->reference) + return; + + if (m->private || (m->wire_count != 0)) + return; + + m->no_cache = FALSE; + + VM_PAGE_QUEUES_REMOVE(m); + + assert(!m->laundry); + assert(m->pageq.next == NULL && m->pageq.prev == NULL); + + queue_enter(&vm_page_queue_inactive, m, vm_page_t, pageq); + m->inactive = TRUE; + + vm_page_inactive_count++; + token_new_pagecount++; } + /* * vm_page_part_zero_fill: * @@ -2245,6 +2776,9 @@ vm_page_part_copy( * make sure the page is decrypted first, if necessary. */ +int vm_page_copy_cs_validations = 0; +int vm_page_copy_cs_tainted = 0; + void vm_page_copy( vm_page_t src_m, @@ -2270,32 +2804,38 @@ vm_page_copy( } dest_m->encrypted = FALSE; + if (src_m->object != VM_OBJECT_NULL && + src_m->object->code_signed && + !src_m->cs_validated) { + /* + * We're copying a not-yet-validated page from a + * code-signed object. + * Whoever ends up mapping the copy page might care about + * the original page's integrity, so let's validate the + * source page now. + */ + vm_page_copy_cs_validations++; + vm_page_validate_cs(src_m); + } + /* + * Propagate the code-signing bits to the copy page. + */ + dest_m->cs_validated = src_m->cs_validated; + dest_m->cs_tainted = src_m->cs_tainted; + if (dest_m->cs_tainted) { + assert(dest_m->cs_validated); + vm_page_copy_cs_tainted++; + } + pmap_copy_page(src_m->phys_page, dest_m->phys_page); } -/* - * Currently, this is a primitive allocator that grabs - * free pages from the system, sorts them by physical - * address, then searches for a region large enough to - * satisfy the user's request. - * - * Additional levels of effort: - * + steal clean active/inactive pages - * + force pageouts of dirty pages - * + maintain a map of available physical - * memory - */ - -#if MACH_ASSERT +#if MACH_ASSERT /* * Check that the list of pages is ordered by * ascending physical address and has no holes. 
*/ -int vm_page_verify_contiguous( - vm_page_t pages, - unsigned int npages); - -int +static int vm_page_verify_contiguous( vm_page_t pages, unsigned int npages) @@ -2308,16 +2848,16 @@ vm_page_verify_contiguous( page_count = 1; for (m = NEXT_PAGE(pages); m != VM_PAGE_NULL; m = NEXT_PAGE(m)) { if (m->phys_page != prev_addr + 1) { - printf("m 0x%x prev_addr 0x%x, current addr 0x%x\n", + printf("m %p prev_addr 0x%x, current addr 0x%x\n", m, prev_addr, m->phys_page); - printf("pages 0x%x page_count %d\n", pages, page_count); + printf("pages %p page_count %d\n", pages, page_count); panic("vm_page_verify_contiguous: not contiguous!"); } prev_addr = m->phys_page; ++page_count; } if (page_count != npages) { - printf("pages 0x%x actual count 0x%x but requested 0x%x\n", + printf("pages %p actual count 0x%x but requested 0x%x\n", pages, page_count, npages); panic("vm_page_verify_contiguous: count error"); } @@ -2326,213 +2866,522 @@ vm_page_verify_contiguous( #endif /* MACH_ASSERT */ -cpm_counter(unsigned int vpfls_pages_handled = 0;) -cpm_counter(unsigned int vpfls_head_insertions = 0;) -cpm_counter(unsigned int vpfls_tail_insertions = 0;) -cpm_counter(unsigned int vpfls_general_insertions = 0;) -cpm_counter(unsigned int vpfc_failed = 0;) -cpm_counter(unsigned int vpfc_satisfied = 0;) +#if MACH_ASSERT +/* + * Check the free lists for proper length etc. + */ +static void +vm_page_verify_free_lists( void ) +{ + unsigned int color, npages; + vm_page_t m; + vm_page_t prev_m; + + npages = 0; + + mutex_lock(&vm_page_queue_free_lock); + + for( color = 0; color < vm_colors; color++ ) { + prev_m = (vm_page_t) &vm_page_queue_free[color]; + queue_iterate(&vm_page_queue_free[color], + m, + vm_page_t, + pageq) { + if ((vm_page_t) m->pageq.prev != prev_m) + panic("vm_page_verify_free_lists: corrupted prev ptr"); + if ( ! m->free ) + panic("vm_page_verify_free_lists: not free"); + if ( ! m->busy ) + panic("vm_page_verify_free_lists: not busy"); + if ( (m->phys_page & vm_color_mask) != color) + panic("vm_page_verify_free_lists: wrong color"); + ++npages; + prev_m = m; + } + } + if (npages != vm_page_free_count) + panic("vm_page_verify_free_lists: npages %u free_count %d", + npages, vm_page_free_count); + + mutex_unlock(&vm_page_queue_free_lock); +} +#endif /* MACH_ASSERT */ + + /* - * Find a region large enough to contain at least npages + * CONTIGUOUS PAGE ALLOCATION + * Additional levels of effort: + * + consider pages that are currently 'pmapped' + * this could be expensive since we'd have + * to ask the pmap layer about there state + * + consider dirty pages + * either clean them or + * copy them to other locations... + * + * Find a region large enough to contain at least n pages * of contiguous physical memory. * + * This is done by traversing the vm_page_t array in a linear fashion + * we assume that the vm_page_t array has the avaiable physical pages in an + * ordered, ascending list... this is currently true of all our implementations + * and must remain so... there can be 'holes' in the array... we also can + * no longer tolerate the vm_page_t's in the list being 'freed' and reclaimed + * which use to happen via 'vm_page_convert'... that function was no longer + * being called and was removed... + * + * The basic flow consists of stabilizing some of the interesting state of + * a vm_page_t behind the vm_page_queue and vm_page_free locks... we start our + * sweep at the beginning of the array looking for pages that meet our criterea + * for a 'stealable' page... currently we are pretty conservative... 
if the page + * meets this criterea and is physically contiguous to the previous page in the 'run' + * we keep developing it. If we hit a page that doesn't fit, we reset our state + * and start to develop a new run... if at this point we've already considered + * at least MAX_CONSIDERED_BEFORE_YIELD pages, we'll drop the 2 locks we hold, + * and mutex_pause (which will yield the processor), to keep the latency low w/r + * to other threads trying to acquire free pages (or move pages from q to q), + * and then continue from the spot we left off... we only make 1 pass through the + * array. Once we have a 'run' that is long enough, we'll go into the loop which + * which steals the pages from the queues they're currently on... pages on the free + * queue can be stolen directly... pages that are on any of the other queues + * must be removed from the object they are tabled on... this requires taking the + * object lock... we do this as a 'try' to prevent deadlocks... if the 'try' fails + * or if the state of the page behind the vm_object lock is no longer viable, we'll + * dump the pages we've currently stolen back to the free list, and pick up our + * scan from the point where we aborted the 'current' run. + * + * * Requirements: - * - Called while holding vm_page_queue_free_lock. - * - Doesn't respect vm_page_free_reserved; caller - * must not ask for more pages than are legal to grab. + * - neither vm_page_queue nor vm_free_list lock can be held on entry * - * Returns a pointer to a list of gobbled pages or VM_PAGE_NULL. + * Returns a pointer to a list of gobbled/wired pages or VM_PAGE_NULL. * * Algorithm: - * Loop over the free list, extracting one page at a time and - * inserting those into a sorted sub-list. We stop as soon as - * there's a contiguous range within the sorted list that can - * satisfy the contiguous memory request. This contiguous sub- - * list is chopped out of the sorted sub-list and the remainder - * of the sorted sub-list is put back onto the beginning of the - * free list. */ + +#define MAX_CONSIDERED_BEFORE_YIELD 1000 + + +#define RESET_STATE_OF_RUN() \ + MACRO_BEGIN \ + prevcontaddr = -2; \ + free_considered = 0; \ + substitute_needed = 0; \ + npages = 0; \ + MACRO_END + + static vm_page_t vm_page_find_contiguous( - unsigned int contig_pages) + unsigned int contig_pages, + ppnum_t max_pnum, + boolean_t wire) { - vm_page_t sort_list; - vm_page_t *contfirstprev, contlast; - vm_page_t m, m1; + vm_page_t m = NULL; ppnum_t prevcontaddr; - ppnum_t nextcontaddr; - unsigned int npages; - - m = NULL; -#if DEBUG - _mutex_assert(&vm_page_queue_free_lock, MA_OWNED); + unsigned int npages, considered; + unsigned int page_idx, start_idx; + int free_considered, free_available; + int substitute_needed; +#if MACH_ASSERT + uint32_t tv_start_sec, tv_start_usec, tv_end_sec, tv_end_usec; + int yielded = 0; + int dumped_run = 0; + int stolen_pages = 0; #endif -#if MACH_ASSERT - /* - * Verify pages in the free list.. - */ - npages = 0; - for (m = vm_page_queue_free; m != VM_PAGE_NULL; m = NEXT_PAGE(m)) - ++npages; - if (npages != vm_page_free_count) - panic("vm_sort_free_list: prelim: npages %u free_count %d", - npages, vm_page_free_count); -#endif /* MACH_ASSERT */ - if (contig_pages == 0 || vm_page_queue_free == VM_PAGE_NULL) + if (contig_pages == 0) return VM_PAGE_NULL; -#define PPNUM_PREV(x) (((x) > 0) ? ((x) - 1) : 0) -#define PPNUM_NEXT(x) (((x) < PPNUM_MAX) ? 
((x) + 1) : PPNUM_MAX) -#define SET_NEXT_PAGE(m,n) ((m)->pageq.next = (struct queue_entry *) (n)) +#if MACH_ASSERT + vm_page_verify_free_lists(); - npages = 1; - contfirstprev = &sort_list; - contlast = sort_list = vm_page_queue_free; - vm_page_queue_free = NEXT_PAGE(sort_list); - SET_NEXT_PAGE(sort_list, VM_PAGE_NULL); - prevcontaddr = PPNUM_PREV(sort_list->phys_page); - nextcontaddr = PPNUM_NEXT(sort_list->phys_page); + clock_get_system_microtime(&tv_start_sec, &tv_start_usec); +#endif + vm_page_lock_queues(); + mutex_lock(&vm_page_queue_free_lock); - while (npages < contig_pages && - (m = vm_page_queue_free) != VM_PAGE_NULL) - { - cpm_counter(++vpfls_pages_handled); + RESET_STATE_OF_RUN(); - /* prepend to existing run? */ - if (m->phys_page == prevcontaddr) - { - vm_page_queue_free = NEXT_PAGE(m); - cpm_counter(++vpfls_head_insertions); - prevcontaddr = PPNUM_PREV(prevcontaddr); - SET_NEXT_PAGE(m, *contfirstprev); - *contfirstprev = m; - npages++; - continue; /* no tail expansion check needed */ - } - - /* append to tail of existing run? */ - else if (m->phys_page == nextcontaddr) - { - vm_page_queue_free = NEXT_PAGE(m); - cpm_counter(++vpfls_tail_insertions); - nextcontaddr = PPNUM_NEXT(nextcontaddr); - SET_NEXT_PAGE(m, NEXT_PAGE(contlast)); - SET_NEXT_PAGE(contlast, m); - contlast = m; - npages++; - } + considered = 0; + free_available = vm_page_free_count - vm_page_free_reserved; - /* prepend to the very front of sorted list? */ - else if (m->phys_page < sort_list->phys_page) - { - vm_page_queue_free = NEXT_PAGE(m); - cpm_counter(++vpfls_general_insertions); - prevcontaddr = PPNUM_PREV(m->phys_page); - nextcontaddr = PPNUM_NEXT(m->phys_page); - SET_NEXT_PAGE(m, sort_list); - contfirstprev = &sort_list; - contlast = sort_list = m; - npages = 1; + for (page_idx = 0, start_idx = 0; + npages < contig_pages && page_idx < vm_pages_count; + page_idx++) { +retry: + m = &vm_pages[page_idx]; + + if (max_pnum && m->phys_page > max_pnum) { + /* no more low pages... */ + break; } + if (m->phys_page <= vm_lopage_poolend && + m->phys_page >= vm_lopage_poolstart) { + /* + * don't want to take pages from our + * reserved pool of low memory + * so don't consider it which + * means starting a new run + */ + RESET_STATE_OF_RUN(); - else /* get to proper place for insertion */ - { - if (m->phys_page < nextcontaddr) - { - prevcontaddr = PPNUM_PREV(sort_list->phys_page); - nextcontaddr = PPNUM_NEXT(sort_list->phys_page); - contfirstprev = &sort_list; - contlast = sort_list; + } else if (m->wire_count || m->gobbled || + m->encrypted || m->encrypted_cleaning || m->cs_validated || m->cs_tainted || + m->error || m->absent || m->pageout_queue || m->laundry || m->wanted || m->precious || + m->cleaning || m->overwriting || m->restart || m->unusual || m->list_req_pending) { + /* + * page is in a transient state + * or a state we don't want to deal + * with, so don't consider it which + * means starting a new run + */ + RESET_STATE_OF_RUN(); + + } else if (!m->free && !m->active && !m->inactive && !m->speculative && !m->throttled) { + /* + * page needs to be on one of our queues + * in order for it to be stable behind the + * locks we hold at this point... + * if not, don't consider it which + * means starting a new run + */ + RESET_STATE_OF_RUN(); + + } else if (!m->free && (!m->tabled || m->busy)) { + /* + * pages on the free list are always 'busy' + * so we couldn't test for 'busy' in the check + * for the transient states... 
pages that are + * 'free' are never 'tabled', so we also couldn't + * test for 'tabled'. So we check here to make + * sure that a non-free page is not busy and is + * tabled on an object... + * if not, don't consider it which + * means starting a new run + */ + RESET_STATE_OF_RUN(); + + } else { + if (m->phys_page != prevcontaddr + 1) { npages = 1; + start_idx = page_idx; + } else { + npages++; } - for (m1 = NEXT_PAGE(contlast); - npages < contig_pages && - m1 != VM_PAGE_NULL && m1->phys_page < m->phys_page; - m1 = NEXT_PAGE(m1)) - { - if (m1->phys_page != nextcontaddr) { - prevcontaddr = PPNUM_PREV(m1->phys_page); - contfirstprev = NEXT_PAGE_PTR(contlast); - npages = 1; - } else { - npages++; + prevcontaddr = m->phys_page; + + if (m->pmapped || m->dirty) + substitute_needed++; + + if (m->free) { + free_considered++; + } + if ((free_considered + substitute_needed) > free_available) { + /* + * if we let this run continue + * we will end up dropping the vm_page_free_count + * below the reserve limit... we need to abort + * this run, but we can at least re-consider this + * page... thus the jump back to 'retry' + */ + RESET_STATE_OF_RUN(); + + if (free_available && considered <= MAX_CONSIDERED_BEFORE_YIELD) { + considered++; + goto retry; } - nextcontaddr = PPNUM_NEXT(m1->phys_page); - contlast = m1; + /* + * free_available == 0 + * so can't consider any free pages... if + * we went to retry in this case, we'd + * get stuck looking at the same page + * w/o making any forward progress + * we also want to take this path if we've already + * reached our limit that controls the lock latency + */ } + } + if (considered > MAX_CONSIDERED_BEFORE_YIELD && npages <= 1) { + + mutex_unlock(&vm_page_queue_free_lock); + vm_page_unlock_queues(); + + mutex_pause(0); + vm_page_lock_queues(); + mutex_lock(&vm_page_queue_free_lock); + + RESET_STATE_OF_RUN(); /* - * We may actually already have enough. - * This could happen if a previous prepend - * joined up two runs to meet our needs. - * If so, bail before we take the current - * page off the free queue. + * reset our free page limit since we + * dropped the lock protecting the vm_page_free_queue */ - if (npages == contig_pages) - break; + free_available = vm_page_free_count - vm_page_free_reserved; + considered = 0; +#if MACH_ASSERT + yielded++; +#endif + goto retry; + } + considered++; + } + m = VM_PAGE_NULL; + + if (npages != contig_pages) + mutex_unlock(&vm_page_queue_free_lock); + else { + vm_page_t m1; + vm_page_t m2; + unsigned int cur_idx; + unsigned int tmp_start_idx; + vm_object_t locked_object = VM_OBJECT_NULL; + boolean_t abort_run = FALSE; + + tmp_start_idx = start_idx; + + /* + * first pass through to pull the free pages + * off of the free queue so that in case we + * need substitute pages, we won't grab any + * of the free pages in the run... 
we'll clear + * the 'free' bit in the 2nd pass, and even in + * an abort_run case, we'll collect all of the + * free pages in this run and return them to the free list + */ + while (start_idx < page_idx) { + + m1 = &vm_pages[start_idx++]; + + if (m1->free) { + unsigned int color; + + color = m1->phys_page & vm_color_mask; + queue_remove(&vm_page_queue_free[color], + m1, + vm_page_t, + pageq); + + vm_page_free_count--; + } + } + /* + * adjust global freelist counts + */ + if (vm_page_free_count < vm_page_free_count_minimum) + vm_page_free_count_minimum = vm_page_free_count; + + /* + * we can drop the free queue lock at this point since + * we've pulled any 'free' candidates off of the list + * we need it dropped so that we can do a vm_page_grab + * when substituing for pmapped/dirty pages + */ + mutex_unlock(&vm_page_queue_free_lock); + + start_idx = tmp_start_idx; + cur_idx = page_idx - 1; + + while (start_idx++ < page_idx) { + /* + * must go through the list from back to front + * so that the page list is created in the + * correct order - low -> high phys addresses + */ + m1 = &vm_pages[cur_idx--]; + + if (m1->free) { + /* + * pages have already been removed from + * the free list in the 1st pass + */ + assert(m1->free); + assert(m1->busy); + assert(!m1->wanted); + assert(!m1->laundry); + m1->free = FALSE; - if (m->phys_page != nextcontaddr) - { - contfirstprev = NEXT_PAGE_PTR(contlast); - prevcontaddr = PPNUM_PREV(m->phys_page); - nextcontaddr = PPNUM_NEXT(m->phys_page); - npages = 1; } else { - nextcontaddr = PPNUM_NEXT(nextcontaddr); - npages++; + vm_object_t object; + + if (abort_run == TRUE) + continue; + + object = m1->object; + + if (object != locked_object) { + if (locked_object) { + vm_object_unlock(locked_object); + locked_object = VM_OBJECT_NULL; + } + if (vm_object_lock_try(object)) + locked_object = object; + } + if (locked_object == VM_OBJECT_NULL || + (m1->wire_count || m1->gobbled || + m1->encrypted || m1->encrypted_cleaning || m1->cs_validated || m1->cs_tainted || + m1->error || m1->absent || m1->pageout_queue || m1->laundry || m1->wanted || m1->precious || + m1->cleaning || m1->overwriting || m1->restart || m1->unusual || m1->list_req_pending || m1->busy)) { + + if (locked_object) { + vm_object_unlock(locked_object); + locked_object = VM_OBJECT_NULL; + } + tmp_start_idx = cur_idx; + abort_run = TRUE; + continue; + } + if (m1->pmapped || m1->dirty) { + int refmod; + vm_object_offset_t offset; + + m2 = vm_page_grab(); + + if (m2 == VM_PAGE_NULL) { + if (locked_object) { + vm_object_unlock(locked_object); + locked_object = VM_OBJECT_NULL; + } + tmp_start_idx = cur_idx; + abort_run = TRUE; + continue; + } + if (m1->pmapped) + refmod = pmap_disconnect(m1->phys_page); + else + refmod = 0; + vm_page_copy(m1, m2); + + m2->reference = m1->reference; + m2->dirty = m1->dirty; + + if (refmod & VM_MEM_REFERENCED) + m2->reference = TRUE; + if (refmod & VM_MEM_MODIFIED) + m2->dirty = TRUE; + offset = m1->offset; + + /* + * completely cleans up the state + * of the page so that it is ready + * to be put onto the free list, or + * for this purpose it looks like it + * just came off of the free list + */ + vm_page_free_prepare(m1); + + /* + * make sure we clear the ref/mod state + * from the pmap layer... else we risk + * inheriting state from the last time + * this page was used... 
+ */ + pmap_clear_refmod(m2->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); + /* + * now put the substitute page on the object + */ + vm_page_insert_internal(m2, locked_object, offset, TRUE); + + if (m2->reference) + vm_page_activate(m2); + else + vm_page_deactivate(m2); + + PAGE_WAKEUP_DONE(m2); + + } else { + /* + * completely cleans up the state + * of the page so that it is ready + * to be put onto the free list, or + * for this purpose it looks like it + * just came off of the free list + */ + vm_page_free_prepare(m1); + } +#if MACH_ASSERT + stolen_pages++; +#endif } - vm_page_queue_free = NEXT_PAGE(m); - cpm_counter(++vpfls_general_insertions); - SET_NEXT_PAGE(m, NEXT_PAGE(contlast)); - SET_NEXT_PAGE(contlast, m); - contlast = m; + m1->pageq.next = (queue_entry_t) m; + m1->pageq.prev = NULL; + m = m1; } - - /* See how many pages are now contiguous after the insertion */ - for (m1 = NEXT_PAGE(m); - npages < contig_pages && - m1 != VM_PAGE_NULL && m1->phys_page == nextcontaddr; - m1 = NEXT_PAGE(m1)) - { - nextcontaddr = PPNUM_NEXT(nextcontaddr); - contlast = m1; - npages++; + if (locked_object) { + vm_object_unlock(locked_object); + locked_object = VM_OBJECT_NULL; } - } - /* how did we do? */ - if (npages == contig_pages) - { - cpm_counter(++vpfc_satisfied); + if (abort_run == TRUE) { + if (m != VM_PAGE_NULL) { + vm_page_free_list(m); + } +#if MACH_ASSERT + dumped_run++; +#endif + /* + * want the index of the last + * page in this run that was + * successfully 'stolen', so back + * it up 1 for the auto-decrement on use + * and 1 more to bump back over this page + */ + page_idx = tmp_start_idx + 2; + + if (page_idx >= vm_pages_count) + goto done_scanning; - /* remove the contiguous range from the sorted list */ - m = *contfirstprev; - *contfirstprev = NEXT_PAGE(contlast); - SET_NEXT_PAGE(contlast, VM_PAGE_NULL); - assert(vm_page_verify_contiguous(m, npages)); + mutex_lock(&vm_page_queue_free_lock); + + RESET_STATE_OF_RUN(); + + /* + * reset our free page limit since we + * dropped the lock protecting the vm_page_free_queue + */ + free_available = vm_page_free_count - vm_page_free_reserved; + + goto retry; + } - /* inline vm_page_gobble() for each returned page */ for (m1 = m; m1 != VM_PAGE_NULL; m1 = NEXT_PAGE(m1)) { - assert(m1->free); - assert(!m1->wanted); - assert(!m1->laundry); - m1->free = FALSE; - m1->no_isync = TRUE; - m1->gobbled = TRUE; + + if (wire == TRUE) + m1->wire_count++; + else + m1->gobbled = TRUE; } + if (wire == FALSE) + vm_page_gobble_count += npages; + + /* + * gobbled pages are also counted as wired pages + */ vm_page_wire_count += npages; - vm_page_gobble_count += npages; - vm_page_free_count -= npages; - /* stick free list at the tail of the sorted list */ - while ((m1 = *contfirstprev) != VM_PAGE_NULL) - contfirstprev = (vm_page_t *)&m1->pageq.next; - *contfirstprev = vm_page_queue_free; + assert(vm_page_verify_contiguous(m, npages)); + } +done_scanning: + vm_page_unlock_queues(); + +#if MACH_ASSERT + clock_get_system_microtime(&tv_end_sec, &tv_end_usec); + + tv_end_sec -= tv_start_sec; + if (tv_end_usec < tv_start_usec) { + tv_end_sec--; + tv_end_usec += 1000000; + } + tv_end_usec -= tv_start_usec; + if (tv_end_usec >= 1000000) { + tv_end_sec++; + tv_end_sec -= 1000000; } + printf("vm_find_page_contiguous(num=%d,low=%d): found %d pages in %d.%06ds... scanned %d pages... yielded %d times... dumped run %d times... 
stole %d pages\n", + contig_pages, max_pnum, npages, tv_end_sec, tv_end_usec, page_idx, yielded, dumped_run, stolen_pages); - vm_page_queue_free = sort_list; + vm_page_verify_free_lists(); +#endif return m; } @@ -2543,78 +3392,52 @@ kern_return_t cpm_allocate( vm_size_t size, vm_page_t *list, + ppnum_t max_pnum, boolean_t wire) { - register vm_page_t m; vm_page_t pages; unsigned int npages; - unsigned int vm_pages_available; - boolean_t wakeup; if (size % page_size != 0) return KERN_INVALID_ARGUMENT; - vm_page_lock_queues(); - mutex_lock(&vm_page_queue_free_lock); - - /* - * Should also take active and inactive pages - * into account... One day... - */ npages = size / page_size; - vm_pages_available = vm_page_free_count - vm_page_free_reserved; - - if (npages > vm_pages_available) { - mutex_unlock(&vm_page_queue_free_lock); - vm_page_unlock_queues(); - return KERN_RESOURCE_SHORTAGE; - } /* * Obtain a pointer to a subset of the free * list large enough to satisfy the request; * the region will be physically contiguous. */ - pages = vm_page_find_contiguous(npages); - - /* adjust global freelist counts and determine need for wakeups */ - if (vm_page_free_count < vm_page_free_count_minimum) - vm_page_free_count_minimum = vm_page_free_count; + pages = vm_page_find_contiguous(npages, max_pnum, wire); - wakeup = ((vm_page_free_count < vm_page_free_min) || - ((vm_page_free_count < vm_page_free_target) && - (vm_page_inactive_count < vm_page_inactive_target))); - - mutex_unlock(&vm_page_queue_free_lock); - - if (pages == VM_PAGE_NULL) { - vm_page_unlock_queues(); + if (pages == VM_PAGE_NULL) return KERN_NO_SPACE; - } - /* - * Walk the returned list, wiring the pages. + * determine need for wakeups */ - if (wire == TRUE) - for (m = pages; m != VM_PAGE_NULL; m = NEXT_PAGE(m)) { - /* - * Essentially inlined vm_page_wire. - */ - assert(!m->active); - assert(!m->inactive); - assert(!m->private); - assert(!m->fictitious); - assert(m->wire_count == 0); - assert(m->gobbled); - m->gobbled = FALSE; - m->wire_count++; - --vm_page_gobble_count; - } - vm_page_unlock_queues(); - - if (wakeup) + if ((vm_page_free_count < vm_page_free_min) || + ((vm_page_free_count < vm_page_free_target) && + ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) thread_wakeup((event_t) &vm_page_free_wanted); + +#if CONFIG_EMBEDDED + { + int percent_avail; + /* + * Decide if we need to poke the memorystatus notification thread. + */ + percent_avail = + (vm_page_active_count + vm_page_inactive_count + + vm_page_speculative_count + vm_page_free_count + + vm_page_purgeable_count ) * 100 / + atop_64(max_mem); + if (percent_avail <= (kern_memorystatus_level - 5)) { + kern_memorystatus_level = percent_avail; + thread_wakeup((event_t)&kern_memorystatus_wakeup); + } + } +#endif /* * The CPM pages should now be available and * ordered by ascending physical address. @@ -2624,7 +3447,7 @@ cpm_allocate( *list = pages; return KERN_SUCCESS; } - + #include #if MACH_VM_DEBUG @@ -2697,9 +3520,10 @@ vm_page_print( printf(", offset=0x%x", p->offset); printf(", wire_count=%d", p->wire_count); - iprintf("%sinactive, %sactive, %sgobbled, %slaundry, %sfree, %sref, %sencrypted\n", + iprintf("%sinactive, %sactive, %sthrottled, %sgobbled, %slaundry, %sfree, %sref, %sencrypted\n", (p->inactive ? "" : "!"), (p->active ? "" : "!"), + (p->throttled ? "" : "!"), (p->gobbled ? "" : "!"), (p->laundry ? "" : "!"), (p->free ? "" : "!"), @@ -2719,16 +3543,12 @@ vm_page_print( (p->cleaning ? "" : "!"), (p->pageout ? "" : "!"), (p->clustered ? 
"" : "!")); - iprintf("%slock_supplied, %soverwriting, %srestart, %sunusual\n", - (p->lock_supplied ? "" : "!"), + iprintf("%soverwriting, %srestart, %sunusual\n", (p->overwriting ? "" : "!"), (p->restart ? "" : "!"), (p->unusual ? "" : "!")); iprintf("phys_page=0x%x", p->phys_page); - printf(", page_error=0x%x", p->page_error); - printf(", page_lock=0x%x", p->page_lock); - printf(", unlock_request=%d\n", p->unlock_request); db_indent -= 2; }