/* apple/xnu: osfmk/vm/vm_phantom_cache.c (xnu-3789.31.2) */

/*
 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phantom_cache.h>
#include <vm/vm_compressor.h>


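/*
 * Tunables: the length of one sampling/evaluation period and the
 * per-period thrashing thresholds.  As is_thrashing() notes below, the
 * threshold is latency-dependent: the cheaper a re-read is, the more
 * re-reads of recently evicted pages it takes before the traffic counts
 * as thrashing, hence the much higher threshold for SSD-backed objects.
 */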
uint32_t phantom_cache_eval_period_in_msecs = 250;
uint32_t phantom_cache_thrashing_threshold_ssd = 1000;
uint32_t phantom_cache_thrashing_threshold = 100;

/*
 * Number of consecutive thrashing periods required before
 * vm_phantom_cache_check_pressure() returns true.
 */
unsigned phantom_cache_contiguous_periods = 2;

clock_sec_t pc_start_of_eval_period_sec = 0;
clock_nsec_t pc_start_of_eval_period_nsec = 0;
boolean_t pc_need_eval_reset = FALSE;

/* One bit per recent sampling period. Bit 0 = current period. */
uint32_t pc_history = 0;

uint32_t sample_period_ghost_added_count = 0;
uint32_t sample_period_ghost_added_count_ssd = 0;
uint32_t sample_period_ghost_found_count = 0;
uint32_t sample_period_ghost_found_count_ssd = 0;

uint32_t vm_phantom_object_id = 1;
#define VM_PHANTOM_OBJECT_ID_AFTER_WRAP 1000000

vm_ghost_t vm_phantom_cache;
uint32_t vm_phantom_cache_nindx = 1;
uint32_t vm_phantom_cache_num_entries = 0;
uint32_t vm_phantom_cache_size;

typedef uint32_t vm_phantom_hash_entry_t;
vm_phantom_hash_entry_t *vm_phantom_cache_hash;
uint32_t vm_phantom_cache_hash_size;
uint32_t vm_ghost_hash_mask;	/* Mask for hash function */
uint32_t vm_ghost_bucket_hash;	/* Basic bucket hash */


int pg_masks[4] = {
	0x1, 0x2, 0x4, 0x8
};


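/*
 * Illustrative sketch (not compiled): how a page maps to a bit in its
 * ghost entry.  Each vm_ghost_t covers VM_GHOST_PAGES_PER_ENTRY (4)
 * consecutive pages of one object, one g_pages_held bit per page.
 * Assuming 4K pages (PAGE_SHIFT == 12) and VM_GHOST_PAGE_MASK == 0x3,
 * a page at object offset 0x6000 has page index 6, lands in position
 * 6 & 0x3 == 2 within its 4-page group, and is tracked by
 * pg_masks[2] == 0x4.
 */
#if 0	/* example only */
static int
example_pg_mask(vm_object_offset_t offset)
{
	/* select the g_pages_held bit for the page at 'offset' */
	return pg_masks[(offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];
}
#endif
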
#define vm_phantom_hash(obj_id, offset) (\
	( (natural_t)((uintptr_t)obj_id * vm_ghost_bucket_hash) + (offset ^ vm_ghost_bucket_hash)) & vm_ghost_hash_mask)


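/*
 * Worked example of vm_phantom_hash (hypothetical sizes): with a
 * 1024-entry table, vm_ghost_hash_mask == 0x3ff and vm_phantom_cache_init()
 * below derives vm_ghost_bucket_hash == (1 << 5) | (1 << 2) | 1 == 0x25.
 * Then vm_phantom_hash(7, 3) == ((7 * 0x25) + (3 ^ 0x25)) & 0x3ff ==
 * (0x103 + 0x26) & 0x3ff == 0x129.  The odd multiplier scatters object ids
 * across buckets; the XOR'd offset term separates entries of one object.
 */
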
struct phantom_cache_stats {
	uint32_t pcs_wrapped;
	uint32_t pcs_added_page_to_entry;
	uint32_t pcs_added_new_entry;
	uint32_t pcs_replaced_entry;

	uint32_t pcs_lookup_found_page_in_cache;
	uint32_t pcs_lookup_entry_not_in_cache;
	uint32_t pcs_lookup_page_not_in_entry;

	uint32_t pcs_updated_phantom_state;
} phantom_cache_stats;


void
vm_phantom_cache_init()
{
	unsigned int num_entries;
	unsigned int log1;
	unsigned int size;

	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE)
		return;
	num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY);
	vm_phantom_cache_num_entries = 1;

	while (vm_phantom_cache_num_entries < num_entries)
		vm_phantom_cache_num_entries <<= 1;

	vm_phantom_cache_size = sizeof(struct vm_ghost) * vm_phantom_cache_num_entries;
	vm_phantom_cache_hash_size = sizeof(vm_phantom_hash_entry_t) * vm_phantom_cache_num_entries;

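	/*
	 * Sizing sketch (illustrative arithmetic, assuming 4K pages and
	 * VM_GHOST_PAGES_PER_ENTRY == 4): with max_mem == 8GB there are
	 * 2,097,152 physical pages; tracking a quarter of them at 4 pages
	 * per entry asks for 524,288 / 4 == 131,072 entries, already a
	 * power of two (2^17), so the loop above settles on 131,072
	 * entries and vm_ghost_hash_mask below becomes 0x1ffff.
	 */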
	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache), vm_phantom_cache_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS)
		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
	bzero(vm_phantom_cache, vm_phantom_cache_size);

	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache_hash), vm_phantom_cache_hash_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS)
		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
	bzero(vm_phantom_cache_hash, vm_phantom_cache_hash_size);


	vm_ghost_hash_mask = vm_phantom_cache_num_entries - 1;

	/*
	 * Build the multiplicative hash constant for vm_phantom_hash()
	 * from the table size: set bits near the square root and the
	 * fourth root of the number of entries, then force the low bit
	 * so the multiplier is always odd.
	 */
	size = vm_phantom_cache_num_entries;
	for (log1 = 0; size > 1; log1++)
		size /= 2;

	vm_ghost_bucket_hash = 1 << ((log1 + 1) >> 1);	/* Get (ceiling of sqrt of table size) */
	vm_ghost_bucket_hash |= 1 << ((log1 + 1) >> 2);	/* Get (ceiling of quadroot of table size) */
	vm_ghost_bucket_hash |= 1;			/* Low bit must always be set to ensure a unique series */

	if (vm_ghost_hash_mask & vm_phantom_cache_num_entries)
		printf("vm_phantom_cache_init: WARNING -- strange page hash\n");
}


void
vm_phantom_cache_add_ghost(vm_page_t m)
{
	vm_ghost_t vpce;
	vm_object_t object;
	int ghost_index;
	int pg_mask;
	boolean_t isSSD = FALSE;
	vm_phantom_hash_entry_t ghost_hash_index;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	if (vm_phantom_cache_num_entries == 0)
		return;

	pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if (object->phantom_object_id == 0) {

		vnode_pager_get_isSSD(object->pager, &isSSD);

		if (isSSD == TRUE)
			object->phantom_isssd = TRUE;

		object->phantom_object_id = vm_phantom_object_id++;

		if (vm_phantom_object_id == 0)
			vm_phantom_object_id = VM_PHANTOM_OBJECT_ID_AFTER_WRAP;
	} else {
		if ( (vpce = vm_phantom_cache_lookup_ghost(m, 0)) ) {
			vpce->g_pages_held |= pg_mask;

			phantom_cache_stats.pcs_added_page_to_entry++;
			goto done;
		}
	}
	/*
	 * If we get here, this page has no vm_ghost_t in the phantom cache
	 * yet: claim the next slot in the ring (which approximates LRU
	 * order), evicting whatever entry occupies it once the ring has
	 * wrapped.  Slot 0 is never used, since hash chains treat a
	 * g_next_index of 0 as the end-of-chain marker.
	 */
	ghost_index = vm_phantom_cache_nindx++;

	if (vm_phantom_cache_nindx == vm_phantom_cache_num_entries) {
		vm_phantom_cache_nindx = 1;

		phantom_cache_stats.pcs_wrapped++;
	}
	vpce = &vm_phantom_cache[ghost_index];

	if (vpce->g_obj_id) {
		/*
		 * we're going to replace an existing entry
		 * so first remove it from the hash
		 */
		vm_ghost_t nvpce;

		ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);

		nvpce = &vm_phantom_cache[vm_phantom_cache_hash[ghost_hash_index]];

		if (nvpce == vpce) {
			vm_phantom_cache_hash[ghost_hash_index] = vpce->g_next_index;
		} else {
			for (;;) {
				if (nvpce->g_next_index == 0)
					panic("didn't find ghost in hash\n");

				if (&vm_phantom_cache[nvpce->g_next_index] == vpce) {
					nvpce->g_next_index = vpce->g_next_index;
					break;
				}
				nvpce = &vm_phantom_cache[nvpce->g_next_index];
			}
		}
		phantom_cache_stats.pcs_replaced_entry++;
	} else
		phantom_cache_stats.pcs_added_new_entry++;

	vpce->g_pages_held = pg_mask;
	vpce->g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
	vpce->g_obj_id = object->phantom_object_id;

	ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);
	vpce->g_next_index = vm_phantom_cache_hash[ghost_hash_index];
	vm_phantom_cache_hash[ghost_hash_index] = ghost_index;

done:
	if (object->phantom_isssd)
		OSAddAtomic(1, &sample_period_ghost_added_count_ssd);
	else
		OSAddAtomic(1, &sample_period_ghost_added_count);
}


vm_ghost_t
vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask)
{
	uint64_t g_obj_offset;
	uint32_t g_obj_id;
	uint32_t ghost_index;
	vm_object_t object;

	object = VM_PAGE_OBJECT(m);

	if ((g_obj_id = object->phantom_object_id) == 0) {
		/*
		 * no entries in phantom cache for this object
		 */
		return (NULL);
	}
	g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;

	ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)];

	while (ghost_index) {
		vm_ghost_t vpce;

		vpce = &vm_phantom_cache[ghost_index];

		if (vpce->g_obj_id == g_obj_id && vpce->g_obj_offset == g_obj_offset) {

			if (pg_mask == 0 || (vpce->g_pages_held & pg_mask)) {
				phantom_cache_stats.pcs_lookup_found_page_in_cache++;

				return (vpce);
			}
			phantom_cache_stats.pcs_lookup_page_not_in_entry++;

			return (NULL);
		}
		ghost_index = vpce->g_next_index;
	}
	phantom_cache_stats.pcs_lookup_entry_not_in_cache++;

	return (NULL);
}


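/*
 * Usage sketch (hypothetical caller, not part of this file): a fault path
 * that brings a file-backed page back in can ask whether that exact page
 * was recently evicted by probing with the page's bit mask.
 * vm_phantom_cache_update() below does just that, and additionally clears
 * the bit so a re-read is only counted once.
 */
#if 0	/* example only */
static boolean_t
example_was_recently_evicted(vm_page_t m)
{
	int pg_mask;

	/* bit for this page within its 4-page ghost entry */
	pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	return (vm_phantom_cache_lookup_ghost(m, pg_mask) != NULL);
}
#endif
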
void
vm_phantom_cache_update(vm_page_t m)
{
	int pg_mask;
	vm_ghost_t vpce;
	vm_object_t object;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	if (vm_phantom_cache_num_entries == 0)
		return;

	pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if ( (vpce = vm_phantom_cache_lookup_ghost(m, pg_mask)) ) {

		vpce->g_pages_held &= ~pg_mask;

		phantom_cache_stats.pcs_updated_phantom_state++;

		if (object->phantom_isssd)
			OSAddAtomic(1, &sample_period_ghost_found_count_ssd);
		else
			OSAddAtomic(1, &sample_period_ghost_found_count);
	}
}


#define PHANTOM_CACHE_DEBUG 1

#if PHANTOM_CACHE_DEBUG

int sample_period_ghost_counts_indx = 0;

struct {
	uint32_t added;
	uint32_t found;
	uint32_t added_ssd;
	uint32_t found_ssd;
	uint32_t elapsed_ms;
	boolean_t pressure_detected;
} sample_period_ghost_counts[256];

#endif

/*
 * Determine if the file cache is thrashing from sampling interval statistics.
 *
 * Pages added to the phantom cache = pages evicted from the file cache.
 * Pages found in the phantom cache = reads of pages that were recently evicted.
 * Threshold is the latency-dependent number of reads we consider thrashing.
 */
static boolean_t
is_thrashing(uint32_t added, uint32_t found, uint32_t threshold)
{
	/* Ignore normal activity below the threshold. */
	if (added < threshold || found < threshold)
		return FALSE;

	/*
	 * When thrashing in a way that we can mitigate, most of the pages read
	 * into the file cache were recently evicted, and 'found' will be close
	 * to 'added'.
	 *
	 * When replacing the current working set because a new app is
	 * launched, we see very high read traffic with sporadic phantom cache
	 * hits.
	 *
	 * This is not thrashing, or freeing up memory wouldn't help much
	 * anyway.
	 */
	if (found < added / 2)
		return FALSE;

	return TRUE;
}

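/*
 * Worked examples (hypothetical counts against the spinning-disk threshold
 * of 100): added == 400, found == 250 -> both counts are over threshold and
 * found >= added/2, so the period counts as thrashing.  added == 5000,
 * found == 300 -> heavy read traffic but sporadic phantom hits (the
 * app-launch pattern described above), so it does not.  added == 90,
 * found == 90 -> below threshold, ignored as normal activity.
 */
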
/*
 * This function is never called from multiple threads simultaneously:
 * a condition variable at the compressor level serializes the callers,
 * so the sample processing below needs no locking of its own.
 */
boolean_t
vm_phantom_cache_check_pressure()
{
	clock_sec_t cur_ts_sec;
	clock_nsec_t cur_ts_nsec;
	uint64_t elapsed_msecs_in_eval;
	boolean_t pressure_detected = FALSE;

	clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);

	elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, pc_start_of_eval_period_sec, pc_start_of_eval_period_nsec);

	/*
	 * Reset evaluation period after phantom_cache_eval_period_in_msecs or
	 * whenever vm_phantom_cache_restart_sample has been called.
	 */
	if (elapsed_msecs_in_eval >= phantom_cache_eval_period_in_msecs) {
		pc_need_eval_reset = TRUE;
	}

	if (pc_need_eval_reset == TRUE) {

#if PHANTOM_CACHE_DEBUG
		/*
		 * maintain some info about the last 256 sample periods
		 */
		sample_period_ghost_counts[sample_period_ghost_counts_indx].added = sample_period_ghost_added_count;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].found = sample_period_ghost_found_count;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].added_ssd = sample_period_ghost_added_count_ssd;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].found_ssd = sample_period_ghost_found_count_ssd;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].elapsed_ms = (uint32_t)elapsed_msecs_in_eval;

		sample_period_ghost_counts_indx++;

		if (sample_period_ghost_counts_indx >= 256)
			sample_period_ghost_counts_indx = 0;
#endif
		sample_period_ghost_added_count = 0;
		sample_period_ghost_found_count = 0;
		sample_period_ghost_added_count_ssd = 0;
		sample_period_ghost_found_count_ssd = 0;

		pc_start_of_eval_period_sec = cur_ts_sec;
		pc_start_of_eval_period_nsec = cur_ts_nsec;
		pc_history <<= 1;
		pc_need_eval_reset = FALSE;
	} else {
		/*
		 * Since the thrashing rate is really a function of the read
		 * latency of the disk, we have to consider both the SSD and
		 * spinning-disk cases, since the file cache could be backed
		 * by either or even both flavors.  When an object is first
		 * assigned a phantom_object_id, we query the pager to
		 * determine whether the backing media is an SSD and remember
		 * that answer in the vm_object.  We use that info to maintain
		 * separate counts for the SSD and spinning-disk cases.
		 */
		if (is_thrashing(sample_period_ghost_added_count,
				 sample_period_ghost_found_count,
				 phantom_cache_thrashing_threshold) ||
		    is_thrashing(sample_period_ghost_added_count_ssd,
				 sample_period_ghost_found_count_ssd,
				 phantom_cache_thrashing_threshold_ssd)) {
			/* Thrashing in the current period: Set bit 0. */
			pc_history |= 1;
		}
	}

	/*
	 * Declare pressure_detected only after phantom_cache_contiguous_periods
	 * consecutive thrashing periods.
	 *
	 * Create a bitmask with the N low bits set. These bits must all be set
	 * in pc_history. The high bits of pc_history are ignored.
	 */
	uint32_t bitmask = (1u << phantom_cache_contiguous_periods) - 1;
	if ((pc_history & bitmask) == bitmask)
		pressure_detected = TRUE;

	if (vm_page_external_count > ((AVAILABLE_MEMORY) * 50) / 100)
		pressure_detected = FALSE;

#if PHANTOM_CACHE_DEBUG
	sample_period_ghost_counts[sample_period_ghost_counts_indx].pressure_detected = pressure_detected;
#endif
	return (pressure_detected);
}

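/*
 * Worked example of the history check above (hypothetical values): with
 * phantom_cache_contiguous_periods == 2 the bitmask is 0x3, so pressure is
 * declared only when bit 0 (the current period) and bit 1 (the previous
 * period) are both set, i.e. two consecutive sampling periods were
 * thrashing.  A single thrashing period leaves (pc_history & 0x3) == 0x1,
 * which does not match.  Even then, the vm_page_external_count test
 * suppresses the signal while file-backed pages still occupy more than
 * half of AVAILABLE_MEMORY.
 */
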
/*
 * Restart the current sampling because conditions have changed significantly,
 * and we don't want to react to old data.
 *
 * This function can be called from any thread.
 */
void
vm_phantom_cache_restart_sample(void)
{
	pc_need_eval_reset = TRUE;
}