/*
 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phantom_cache.h>
#include <vm/vm_compressor.h>
uint32_t phantom_cache_eval_period_in_msecs = 250;
uint32_t phantom_cache_thrashing_threshold_ssd = 1000;
uint32_t phantom_cache_thrashing_threshold = 100;
/*
 * Number of consecutive thrashing periods required before
 * vm_phantom_cache_check_pressure() returns true.
 */
unsigned phantom_cache_contiguous_periods = 2;
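/*
 * With the defaults above (250 ms evaluation periods, 2 contiguous
 * periods), pressure is declared only after roughly half a second of
 * back-to-back thrashing samples.
 */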
clock_sec_t pc_start_of_eval_period_sec = 0;
clock_nsec_t pc_start_of_eval_period_nsec = 0;
boolean_t pc_need_eval_reset = FALSE;
/* One bit per recent sampling period. Bit 0 = current period. */
uint32_t pc_history = 0;
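/*
 * Example of the history encoding (illustrative values): pc_history is
 * shifted left once per completed evaluation period and bit 0 is set
 * while the current period is thrashing, so a value of 0x7 means the
 * three most recent periods all thrashed.
 */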
uint32_t sample_period_ghost_added_count = 0;
uint32_t sample_period_ghost_added_count_ssd = 0;
uint32_t sample_period_ghost_found_count = 0;
uint32_t sample_period_ghost_found_count_ssd = 0;
uint32_t vm_phantom_object_id = 1;
#define VM_PHANTOM_OBJECT_ID_AFTER_WRAP 1000000
vm_ghost_t vm_phantom_cache;
uint32_t vm_phantom_cache_nindx = 1;
uint32_t vm_phantom_cache_num_entries = 0;
uint32_t vm_phantom_cache_size;
typedef uint32_t vm_phantom_hash_entry_t;
vm_phantom_hash_entry_t *vm_phantom_cache_hash;
uint32_t vm_phantom_cache_hash_size;
uint32_t vm_ghost_hash_mask;    /* Mask for hash function */
uint32_t vm_ghost_bucket_hash;  /* Basic bucket hash */
#define vm_phantom_hash(obj_id, offset) (\
    ( (natural_t)((uintptr_t)obj_id * vm_ghost_bucket_hash) + (offset ^ vm_ghost_bucket_hash)) & vm_ghost_hash_mask)
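/*
 * Worked example of the hash (hypothetical constants, for illustration):
 * with vm_ghost_bucket_hash == 0x105 and vm_ghost_hash_mask == 0xffff,
 * vm_phantom_hash(2, 8) == ((2 * 0x105) + (8 ^ 0x105)) & 0xffff
 *                       == (0x20a + 0x10d) & 0xffff
 *                       == 0x317
 */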
struct phantom_cache_stats {
    uint32_t pcs_wrapped;
    uint32_t pcs_added_page_to_entry;
    uint32_t pcs_added_new_entry;
    uint32_t pcs_replaced_entry;

    uint32_t pcs_lookup_found_page_in_cache;
    uint32_t pcs_lookup_entry_not_in_cache;
    uint32_t pcs_lookup_page_not_in_entry;

    uint32_t pcs_updated_phantom_state;
} phantom_cache_stats;
void
vm_phantom_cache_init()
{
    unsigned int num_entries;
    unsigned int log1;
    unsigned int size;
    num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY);
    vm_phantom_cache_num_entries = 1;

    while (vm_phantom_cache_num_entries < num_entries)
        vm_phantom_cache_num_entries <<= 1;
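    /*
     * Rounding up to a power of two lets vm_phantom_hash() mask with
     * vm_ghost_hash_mask (num_entries - 1) instead of taking a modulo;
     * e.g. a num_entries of 3000 becomes 4096 with a mask of 0xfff.
     */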
    vm_phantom_cache_size = sizeof(struct vm_ghost) * vm_phantom_cache_num_entries;
    vm_phantom_cache_hash_size = sizeof(vm_phantom_hash_entry_t) * vm_phantom_cache_num_entries;
    if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache), vm_phantom_cache_size, 0, KMA_KOBJECT) != KERN_SUCCESS)
        panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
    bzero(vm_phantom_cache, vm_phantom_cache_size);
    if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache_hash), vm_phantom_cache_hash_size, 0, KMA_KOBJECT) != KERN_SUCCESS)
        panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
    bzero(vm_phantom_cache_hash, vm_phantom_cache_hash_size);
    vm_ghost_hash_mask = vm_phantom_cache_num_entries - 1;
    /*
     * Calculate object_id shift value for hashing algorithm:
     *   O = log2(sizeof(struct vm_object))
     *   B = log2(vm_page_bucket_count)
     *   hash shifts the object_id left by B - O
     */
    size = vm_phantom_cache_num_entries;

    for (log1 = 0; size > 1; log1++)
        size /= 2;
    vm_ghost_bucket_hash = 1 << ((log1 + 1) >> 1);   /* Get (ceiling of sqrt of table size) */
    vm_ghost_bucket_hash |= 1 << ((log1 + 1) >> 2);  /* Get (ceiling of quadroot of table size) */
    vm_ghost_bucket_hash |= 1;                       /* Set bit and add 1 - must always be 1 to ensure a unique series */
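    /*
     * Example (hypothetical table size, for illustration): for 65536
     * entries, log1 == 16, so vm_ghost_bucket_hash ends up as
     * (1 << 8) | (1 << 4) | 1 == 0x111 -- a small odd multiplier that
     * scatters consecutive object_ids across the buckets.
     */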
    if (vm_ghost_hash_mask & vm_phantom_cache_num_entries)
        printf("vm_phantom_cache_init: WARNING -- strange page hash\n");
}
void
vm_phantom_cache_add_ghost(vm_page_t m)
{
    vm_ghost_t vpce;
    int ghost_index;
    int pg_mask;
    boolean_t isSSD = FALSE;
    vm_phantom_hash_entry_t ghost_hash_index;

#if MACH_ASSERT || DEBUG
    lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
    vm_object_lock_assert_exclusive(m->object);
#endif
    if (vm_phantom_cache_num_entries == 0)
        return;

    pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];
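    /*
     * Each ghost entry covers VM_GHOST_PAGES_PER_ENTRY consecutive pages
     * as a bitfield; pg_mask selects this page's bit within that group
     * (assuming pg_masks is a table of single-bit masks indexed by the
     * page's slot in the group).
     */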
    if (m->object->phantom_object_id == 0) {

        vnode_pager_get_isSSD(m->object->pager, &isSSD);

        if (isSSD == TRUE)
            m->object->phantom_isssd = TRUE;

        m->object->phantom_object_id = vm_phantom_object_id++;

        if (vm_phantom_object_id == 0)
            vm_phantom_object_id = VM_PHANTOM_OBJECT_ID_AFTER_WRAP;
    } else {
        if ( (vpce = vm_phantom_cache_lookup_ghost(m, 0)) ) {
            vpce->g_pages_held |= pg_mask;

            phantom_cache_stats.pcs_added_page_to_entry++;

            goto done;
        }
    }
    /*
     * if we're here then the vm_ghost_t of this vm_page_t
     * is not present in the phantom cache... take the next
     * available entry in the LRU first evicting the existing
     * entry if we've wrapped the ring
     */
    ghost_index = vm_phantom_cache_nindx++;

    if (vm_phantom_cache_nindx == vm_phantom_cache_num_entries) {
        vm_phantom_cache_nindx = 1;

        phantom_cache_stats.pcs_wrapped++;
    }
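    /*
     * The ring deliberately skips slot 0: a g_next_index of 0 terminates
     * a hash chain, so entry 0 of the table can never be handed out.
     */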
    vpce = &vm_phantom_cache[ghost_index];
    if (vpce->g_obj_id) {
        /*
         * we're going to replace an existing entry
         * so first remove it from the hash
         */
        vm_ghost_t nvpce;

        ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);

        nvpce = &vm_phantom_cache[vm_phantom_cache_hash[ghost_hash_index]];

        if (nvpce == vpce) {
            vm_phantom_cache_hash[ghost_hash_index] = vpce->g_next_index;
        } else {
            for (;;) {
                if (nvpce->g_next_index == 0)
                    panic("didn't find ghost in hash\n");

                if (&vm_phantom_cache[nvpce->g_next_index] == vpce) {
                    nvpce->g_next_index = vpce->g_next_index;
                    break;
                }
                nvpce = &vm_phantom_cache[nvpce->g_next_index];
            }
        }
        phantom_cache_stats.pcs_replaced_entry++;
    } else
        phantom_cache_stats.pcs_added_new_entry++;
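    /*
     * Sketch of the unlink above: each hash bucket is a singly-linked
     * chain of table indices (g_next_index). Evicting vpce from a chain
     * head -> A -> vpce -> B either repoints the bucket head at B (when
     * vpce is first) or walks to A and splices A -> B.
     */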
    vpce->g_pages_held = pg_mask;
    vpce->g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
    vpce->g_obj_id = m->object->phantom_object_id;
    ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);
    vpce->g_next_index = vm_phantom_cache_hash[ghost_hash_index];
    vm_phantom_cache_hash[ghost_hash_index] = ghost_index;
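    /*
     * The new entry is pushed onto the front of its hash chain: it
     * inherits the previous head through g_next_index and the bucket
     * now points at ghost_index.
     */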
done:
    if (m->object->phantom_isssd)
        OSAddAtomic(1, &sample_period_ghost_added_count_ssd);
    else
        OSAddAtomic(1, &sample_period_ghost_added_count);
}
vm_ghost_t
vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask)
{
    uint64_t g_obj_offset;
    uint32_t g_obj_id;
    uint32_t ghost_index;
    vm_ghost_t vpce;
    if ((g_obj_id = m->object->phantom_object_id) == 0) {
        /*
         * no entries in phantom cache for this object
         */
        return (NULL);
    }
    g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;

    ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)];
    while (ghost_index) {

        vpce = &vm_phantom_cache[ghost_index];

        if (vpce->g_obj_id == g_obj_id && vpce->g_obj_offset == g_obj_offset) {

            if (pg_mask == 0 || (vpce->g_pages_held & pg_mask)) {
                phantom_cache_stats.pcs_lookup_found_page_in_cache++;

                return (vpce);
            }
            phantom_cache_stats.pcs_lookup_page_not_in_entry++;

            return (NULL);
        }
        ghost_index = vpce->g_next_index;
    }
    phantom_cache_stats.pcs_lookup_entry_not_in_cache++;

    return (NULL);
}
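/*
 * Note on pg_mask: callers pass 0 to ask only whether a ghost entry
 * exists for this page's group (as vm_phantom_cache_add_ghost does),
 * while a non-zero mask additionally requires that this specific page
 * was held when it was evicted.
 */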
void
vm_phantom_cache_update(vm_page_t m)
{
    int pg_mask;
    vm_ghost_t vpce;

#if MACH_ASSERT || DEBUG
    lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
    vm_object_lock_assert_exclusive(m->object);
#endif
    if (vm_phantom_cache_num_entries == 0)
        return;

    pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];
    if ( (vpce = vm_phantom_cache_lookup_ghost(m, pg_mask)) ) {

        vpce->g_pages_held &= ~pg_mask;

        phantom_cache_stats.pcs_updated_phantom_state++;

        if (m->object->phantom_isssd)
            OSAddAtomic(1, &sample_period_ghost_found_count_ssd);
        else
            OSAddAtomic(1, &sample_period_ghost_found_count);
    }
}
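/*
 * Clearing the bit above means each eviction is counted at most once:
 * after a page's ghost hit is recorded, a second fault on the same page
 * cannot inflate the found counters.
 */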
#define PHANTOM_CACHE_DEBUG 1

#if PHANTOM_CACHE_DEBUG

int sample_period_ghost_counts_indx = 0;

struct {
    uint32_t added;
    uint32_t found;
    uint32_t added_ssd;
    uint32_t found_ssd;
    uint32_t elapsed_ms;
    boolean_t pressure_detected;
} sample_period_ghost_counts[256];

#endif
/*
 * Determine if the file cache is thrashing from sampling interval statistics.
 *
 * Pages added to the phantom cache = pages evicted from the file cache.
 * Pages found in the phantom cache = reads of pages that were recently evicted.
 * Threshold is the latency-dependent number of reads we consider thrashing.
 */
static boolean_t
is_thrashing(uint32_t added, uint32_t found, uint32_t threshold)
{
    /* Ignore normal activity below the threshold. */
    if (added < threshold || found < threshold)
        return FALSE;
    /*
     * When thrashing in a way that we can mitigate, most of the pages read
     * into the file cache were recently evicted, and 'found' will be close
     * to 'added'.
     *
     * When replacing the current working set because a new app is
     * launched, we see very high read traffic with sporadic phantom cache
     * hits.
     *
     * This is not thrashing, or freeing up memory wouldn't help much
     * anyway.
     */
    if (found < added / 2)
        return FALSE;

    return TRUE;
}
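/*
 * Worked example (hypothetical counts, threshold == 100): added == 500
 * with found == 400 is thrashing, since 400 >= 500 / 2; added == 500
 * with found == 150 is treated as working-set replacement and ignored,
 * since 150 < 250.
 */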
/*
 * the following function is never called
 * from multiple threads simultaneously due
 * to a condition variable used to serialize
 * at the compressor level... thus no need
 * to provide locking for the sample processing
 */
boolean_t
vm_phantom_cache_check_pressure()
{
    clock_sec_t cur_ts_sec;
    clock_nsec_t cur_ts_nsec;
    uint64_t elapsed_msecs_in_eval;
    boolean_t pressure_detected = FALSE;
    clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);

    elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, pc_start_of_eval_period_sec, pc_start_of_eval_period_nsec);
    /*
     * Reset evaluation period after phantom_cache_eval_period_in_msecs or
     * whenever vm_phantom_cache_restart_sample has been called.
     */
    if (elapsed_msecs_in_eval >= phantom_cache_eval_period_in_msecs) {
        pc_need_eval_reset = TRUE;
    }

    if (pc_need_eval_reset == TRUE) {
#if PHANTOM_CACHE_DEBUG
        /*
         * maintain some info about the last 256 sample periods
         */
        sample_period_ghost_counts[sample_period_ghost_counts_indx].added = sample_period_ghost_added_count;
        sample_period_ghost_counts[sample_period_ghost_counts_indx].found = sample_period_ghost_found_count;
        sample_period_ghost_counts[sample_period_ghost_counts_indx].added_ssd = sample_period_ghost_added_count_ssd;
        sample_period_ghost_counts[sample_period_ghost_counts_indx].found_ssd = sample_period_ghost_found_count_ssd;
        sample_period_ghost_counts[sample_period_ghost_counts_indx].elapsed_ms = (uint32_t)elapsed_msecs_in_eval;

        sample_period_ghost_counts_indx++;

        if (sample_period_ghost_counts_indx >= 256)
            sample_period_ghost_counts_indx = 0;
#endif
        sample_period_ghost_added_count = 0;
        sample_period_ghost_found_count = 0;
        sample_period_ghost_added_count_ssd = 0;
        sample_period_ghost_found_count_ssd = 0;

        pc_start_of_eval_period_sec = cur_ts_sec;
        pc_start_of_eval_period_nsec = cur_ts_nsec;
        pc_history <<= 1;
        pc_need_eval_reset = FALSE;
    } else {
        /*
         * Since the thrashing rate is really a function of the read latency of the disk
         * we have to consider both the SSD and spinning disk case since the file cache
         * could be backed by either or even both flavors. When the object is first
         * assigned a phantom_object_id, we query the pager to determine if the
         * backing media is an SSD and remember that answer in the vm_object. We use
         * that info to maintain counts for both the SSD and spinning disk cases.
         */
        if (is_thrashing(sample_period_ghost_added_count,
                         sample_period_ghost_found_count,
                         phantom_cache_thrashing_threshold) ||
            is_thrashing(sample_period_ghost_added_count_ssd,
                         sample_period_ghost_found_count_ssd,
                         phantom_cache_thrashing_threshold_ssd)) {
            /* Thrashing in the current period: Set bit 0. */
            pc_history |= 1;
        }
    }
    /*
     * Declare pressure_detected after phantom_cache_contiguous_periods.
     *
     * Create a bitmask with the N low bits set. These bits must all be set
     * in pc_history. The high bits of pc_history are ignored.
     */
    uint32_t bitmask = (1u << phantom_cache_contiguous_periods) - 1;
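    /*
     * With the default phantom_cache_contiguous_periods of 2, bitmask
     * is 0x3: both the current period (bit 0) and the previous one
     * (bit 1) must have thrashed before pressure is declared.
     */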
    if ((pc_history & bitmask) == bitmask)
        pressure_detected = TRUE;

    if (vm_page_external_count > ((AVAILABLE_MEMORY) * 50) / 100)
        pressure_detected = FALSE;
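    /*
     * The check above suppresses the signal while file-backed pages
     * still occupy more than half of available memory.
     */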
#if PHANTOM_CACHE_DEBUG
    sample_period_ghost_counts[sample_period_ghost_counts_indx].pressure_detected = pressure_detected;
#endif

    return (pressure_detected);
}
/*
 * Restart the current sampling because conditions have changed significantly,
 * and we don't want to react to old data.
 *
 * This function can be called from any thread.
 */
void
vm_phantom_cache_restart_sample(void)
{
    pc_need_eval_reset = TRUE;
}