/*
 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phantom_cache.h>
#include <vm/vm_compressor.h>
uint32_t phantom_cache_eval_period_in_msecs = 250;
uint32_t phantom_cache_thrashing_threshold_ssd = 1000;
uint32_t phantom_cache_thrashing_threshold = 100;

/*
 * Number of consecutive thrashing periods required before
 * vm_phantom_cache_check_pressure() returns true.
 */
unsigned phantom_cache_contiguous_periods = 2;

clock_sec_t	pc_start_of_eval_period_sec = 0;
clock_nsec_t	pc_start_of_eval_period_nsec = 0;
boolean_t	pc_need_eval_reset = FALSE;

/* One bit per recent sampling period. Bit 0 = current period. */
uint32_t pc_history = 0;

uint32_t sample_period_ghost_added_count = 0;
uint32_t sample_period_ghost_added_count_ssd = 0;
uint32_t sample_period_ghost_found_count = 0;
uint32_t sample_period_ghost_found_count_ssd = 0;

uint32_t vm_phantom_object_id = 1;
#define VM_PHANTOM_OBJECT_ID_AFTER_WRAP	1000000

vm_ghost_t	vm_phantom_cache;
uint32_t	vm_phantom_cache_nindx = 1;
uint32_t	vm_phantom_cache_num_entries = 0;
uint32_t	vm_phantom_cache_size;

typedef uint32_t	vm_phantom_hash_entry_t;
vm_phantom_hash_entry_t	*vm_phantom_cache_hash;
uint32_t	vm_phantom_cache_hash_size;
uint32_t	vm_ghost_hash_mask;	/* Mask for hash function */
uint32_t	vm_ghost_bucket_hash;	/* Basic bucket hash */
#define vm_phantom_hash(obj_id, offset) (\
	( (natural_t)((uintptr_t)obj_id * vm_ghost_bucket_hash) + (offset ^ vm_ghost_bucket_hash)) & vm_ghost_hash_mask)
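
/*
 * Illustrative sketch, not part of the original source and not compiled:
 * a worked example of the bucket computation for a hypothetical 4096-entry
 * table, for which vm_phantom_cache_init() below would produce
 * vm_ghost_bucket_hash = 73 and vm_ghost_hash_mask = 0xFFF.
 */
#if 0
static uint32_t
vm_phantom_hash_example(void)
{
	uint32_t obj_id = 5;	/* hypothetical phantom_object_id */
	uint64_t offset = 18;	/* hypothetical g_obj_offset */

	/* (5 * 73) + (18 ^ 73) = 365 + 91 = 456, then 456 & 0xFFF = 456 */
	return vm_phantom_hash(obj_id, offset);
}
#endif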
struct phantom_cache_stats {
	uint32_t	pcs_wrapped;
	uint32_t	pcs_added_page_to_entry;
	uint32_t	pcs_added_new_entry;
	uint32_t	pcs_replaced_entry;

	uint32_t	pcs_lookup_found_page_in_cache;
	uint32_t	pcs_lookup_entry_not_in_cache;
	uint32_t	pcs_lookup_page_not_in_entry;

	uint32_t	pcs_updated_phantom_state;
} phantom_cache_stats;
void
vm_phantom_cache_init()
{
	unsigned int	num_entries;
	unsigned int	log1;
	unsigned int	size;

	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE)
		return;

	num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY);
	vm_phantom_cache_num_entries = 1;

	while (vm_phantom_cache_num_entries < num_entries)
		vm_phantom_cache_num_entries <<= 1;

	vm_phantom_cache_size = sizeof(struct vm_ghost) * vm_phantom_cache_num_entries;
	vm_phantom_cache_hash_size = sizeof(vm_phantom_hash_entry_t) * vm_phantom_cache_num_entries;

	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache), vm_phantom_cache_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS)
		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
	bzero(vm_phantom_cache, vm_phantom_cache_size);

	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache_hash), vm_phantom_cache_hash_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS)
		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
	bzero(vm_phantom_cache_hash, vm_phantom_cache_hash_size);

	vm_ghost_hash_mask = vm_phantom_cache_num_entries - 1;

	/*
	 * Calculate object_id shift value for hashing algorithm:
	 *   O = log2(sizeof(struct vm_object))
	 *   B = log2(vm_page_bucket_count)
	 *   hash shifts the object_id left by
	 *   B/2 - O
	 */
	size = vm_phantom_cache_num_entries;
	for (log1 = 0; size > 1; log1++)
		size /= 2;

	vm_ghost_bucket_hash = 1 << ((log1 + 1) >> 1);	/* Get (ceiling of sqrt of table size) */
	vm_ghost_bucket_hash |= 1 << ((log1 + 1) >> 2);	/* Get (ceiling of quadroot of table size) */
	vm_ghost_bucket_hash |= 1;			/* Set bit and add 1 - always must be 1 to ensure unique series */

	if (vm_ghost_hash_mask & vm_phantom_cache_num_entries)
		printf("vm_phantom_cache_init: WARNING -- strange page hash\n");
}
void
vm_phantom_cache_add_ghost(vm_page_t m)
{
	vm_ghost_t	vpce;
	vm_object_t	object;
	int		ghost_index;
	int		pg_mask;
	boolean_t	isSSD = FALSE;
	vm_phantom_hash_entry_t	ghost_hash_index;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	if (vm_phantom_cache_num_entries == 0)
		return;

	pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if (object->phantom_object_id == 0) {

		vnode_pager_get_isSSD(object->pager, &isSSD);

		if (isSSD == TRUE)
			object->phantom_isssd = TRUE;

		object->phantom_object_id = vm_phantom_object_id++;

		if (vm_phantom_object_id == 0)
			vm_phantom_object_id = VM_PHANTOM_OBJECT_ID_AFTER_WRAP;
	} else {
		if ( (vpce = vm_phantom_cache_lookup_ghost(m, 0)) ) {
			vpce->g_pages_held |= pg_mask;

			phantom_cache_stats.pcs_added_page_to_entry++;
			goto done;
		}
	}
	/*
	 * if we're here then the vm_ghost_t of this vm_page_t
	 * is not present in the phantom cache... take the next
	 * available entry in the LRU first evicting the existing
	 * entry if we've wrapped the ring
	 */
	ghost_index = vm_phantom_cache_nindx++;

	if (vm_phantom_cache_nindx == vm_phantom_cache_num_entries) {
		vm_phantom_cache_nindx = 1;

		phantom_cache_stats.pcs_wrapped++;
	}
	vpce = &vm_phantom_cache[ghost_index];

	if (vpce->g_obj_id) {
		/*
		 * we're going to replace an existing entry
		 * so first remove it from the hash
		 */
		vm_ghost_t	nvpce;

		ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);

		nvpce = &vm_phantom_cache[vm_phantom_cache_hash[ghost_hash_index]];

		if (nvpce == vpce) {
			vm_phantom_cache_hash[ghost_hash_index] = vpce->g_next_index;
		} else {
			for (;;) {
				if (nvpce->g_next_index == 0)
					panic("didn't find ghost in hash\n");

				if (&vm_phantom_cache[nvpce->g_next_index] == vpce) {
					nvpce->g_next_index = vpce->g_next_index;
					break;
				}
				nvpce = &vm_phantom_cache[nvpce->g_next_index];
			}
		}
		phantom_cache_stats.pcs_replaced_entry++;
	} else
		phantom_cache_stats.pcs_added_new_entry++;

	vpce->g_pages_held = pg_mask;
	vpce->g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
	vpce->g_obj_id = object->phantom_object_id;

	ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);
	vpce->g_next_index = vm_phantom_cache_hash[ghost_hash_index];
	vm_phantom_cache_hash[ghost_hash_index] = ghost_index;

done:
	if (object->phantom_isssd)
		OSAddAtomic(1, &sample_period_ghost_added_count_ssd);
	else
		OSAddAtomic(1, &sample_period_ghost_added_count);
}
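
/*
 * Illustrative sketch, not part of the original source and not compiled:
 * each vm_ghost entry covers a run of VM_GHOST_PAGES_PER_ENTRY consecutive
 * pages with one g_pages_held bit per page.  Assuming
 * VM_GHOST_PAGES_PER_ENTRY is 4 (so VM_GHOST_PAGE_MASK is 0x3) and that
 * pg_masks[] holds single-bit masks, the table lookup above behaves like:
 */
#if 0
static uint32_t
pg_mask_for_offset(vm_object_offset_t offset)
{
	/* page 0 of the run -> 0x1, page 1 -> 0x2, page 2 -> 0x4, page 3 -> 0x8 */
	return 1u << ((offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK);
}
#endif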
vm_ghost_t
vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask)
{
	uint64_t	g_obj_offset;
	uint32_t	g_obj_id;
	uint32_t	ghost_index;
	vm_object_t	object;
	vm_ghost_t	vpce;

	object = VM_PAGE_OBJECT(m);

	if ((g_obj_id = object->phantom_object_id) == 0) {
		/*
		 * no entries in phantom cache for this object
		 */
		return (NULL);
	}
	g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;

	ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)];

	while (ghost_index) {

		vpce = &vm_phantom_cache[ghost_index];

		if (vpce->g_obj_id == g_obj_id && vpce->g_obj_offset == g_obj_offset) {

			if (pg_mask == 0 || (vpce->g_pages_held & pg_mask)) {
				phantom_cache_stats.pcs_lookup_found_page_in_cache++;

				return (vpce);
			}
			phantom_cache_stats.pcs_lookup_page_not_in_entry++;

			return (NULL);
		}
		ghost_index = vpce->g_next_index;
	}
	phantom_cache_stats.pcs_lookup_entry_not_in_cache++;

	return (NULL);
}
void
vm_phantom_cache_update(vm_page_t m)
{
	int		pg_mask;
	vm_ghost_t	vpce;
	vm_object_t	object;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	if (vm_phantom_cache_num_entries == 0)
		return;

	pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if ( (vpce = vm_phantom_cache_lookup_ghost(m, pg_mask)) ) {

		vpce->g_pages_held &= ~pg_mask;

		phantom_cache_stats.pcs_updated_phantom_state++;

		if (object->phantom_isssd)
			OSAddAtomic(1, &sample_period_ghost_found_count_ssd);
		else
			OSAddAtomic(1, &sample_period_ghost_found_count);
	}
}
#define PHANTOM_CACHE_DEBUG	1

#if PHANTOM_CACHE_DEBUG

int	sample_period_ghost_counts_indx = 0;

struct {
	uint32_t	added;
	uint32_t	found;
	uint32_t	added_ssd;
	uint32_t	found_ssd;
	uint32_t	elapsed_ms;
	boolean_t	pressure_detected;
} sample_period_ghost_counts[256];

#endif
/*
 * Determine if the file cache is thrashing from sampling interval statistics.
 *
 * Pages added to the phantom cache = pages evicted from the file cache.
 * Pages found in the phantom cache = reads of pages that were recently evicted.
 * Threshold is the latency-dependent number of reads we consider thrashing.
 */
static boolean_t
is_thrashing(uint32_t added, uint32_t found, uint32_t threshold)
{
	/* Ignore normal activity below the threshold. */
	if (added < threshold || found < threshold)
		return FALSE;

	/*
	 * When thrashing in a way that we can mitigate, most of the pages read
	 * into the file cache were recently evicted, and 'found' will be close
	 * to 'added'.
	 *
	 * When replacing the current working set because a new app is
	 * launched, we see very high read traffic with sporadic phantom cache
	 * hits.
	 *
	 * This is not thrashing, or freeing up memory wouldn't help much
	 * anyway.
	 */
	if (found < added / 2)
		return FALSE;

	return TRUE;
}
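
/*
 * Illustrative sketch, not part of the original source and not compiled:
 * with the non-SSD threshold of 100, a period with added = 500 and
 * found = 400 is judged thrashing (both counts meet the threshold and
 * found >= added / 2), while added = 500, found = 150 is not (150 < 250),
 * and added = 50, found = 50 is ignored as normal activity.
 */
#if 0
static void
is_thrashing_example(void)
{
	assert(is_thrashing(500, 400, phantom_cache_thrashing_threshold));	/* TRUE */
	assert(!is_thrashing(500, 150, phantom_cache_thrashing_threshold));	/* FALSE: found < added / 2 */
	assert(!is_thrashing(50, 50, phantom_cache_thrashing_threshold));	/* FALSE: below threshold */
}
#endif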
/*
 * the following function is never called
 * from multiple threads simultaneously due
 * to a condition variable used to serialize
 * at the compressor level... thus no need
 * to provide locking for the sample processing
 */
boolean_t
vm_phantom_cache_check_pressure()
{
	clock_sec_t	cur_ts_sec;
	clock_nsec_t	cur_ts_nsec;
	uint64_t	elapsed_msecs_in_eval;
	boolean_t	pressure_detected = FALSE;

	clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);

	elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, pc_start_of_eval_period_sec, pc_start_of_eval_period_nsec);

	/*
	 * Reset evaluation period after phantom_cache_eval_period_in_msecs or
	 * whenever vm_phantom_cache_restart_sample has been called.
	 */
	if (elapsed_msecs_in_eval >= phantom_cache_eval_period_in_msecs) {
		pc_need_eval_reset = TRUE;
	}

	if (pc_need_eval_reset == TRUE) {

#if PHANTOM_CACHE_DEBUG
		/*
		 * maintain some info about the last 256 sample periods
		 */
		sample_period_ghost_counts[sample_period_ghost_counts_indx].added = sample_period_ghost_added_count;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].found = sample_period_ghost_found_count;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].added_ssd = sample_period_ghost_added_count_ssd;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].found_ssd = sample_period_ghost_found_count_ssd;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].elapsed_ms = (uint32_t)elapsed_msecs_in_eval;

		sample_period_ghost_counts_indx++;

		if (sample_period_ghost_counts_indx >= 256)
			sample_period_ghost_counts_indx = 0;
#endif
		sample_period_ghost_added_count = 0;
		sample_period_ghost_found_count = 0;
		sample_period_ghost_added_count_ssd = 0;
		sample_period_ghost_found_count_ssd = 0;

		pc_start_of_eval_period_sec = cur_ts_sec;
		pc_start_of_eval_period_nsec = cur_ts_nsec;
		pc_history <<= 1;
		pc_need_eval_reset = FALSE;
	} else {
		/*
		 * Since the thrashing rate is really a function of the read latency
		 * of the disk, we have to consider both the SSD and spinning disk
		 * cases, since the file cache could be backed by either or even both
		 * flavors. When the object is first assigned a phantom_object_id,
		 * we query the pager to determine if the backing media is an SSD
		 * and remember that answer in the vm_object. We use that info to
		 * maintain counts for both the SSD and spinning disk cases.
		 */
		if (is_thrashing(sample_period_ghost_added_count,
				 sample_period_ghost_found_count,
				 phantom_cache_thrashing_threshold) ||
		    is_thrashing(sample_period_ghost_added_count_ssd,
				 sample_period_ghost_found_count_ssd,
				 phantom_cache_thrashing_threshold_ssd)) {
			/* Thrashing in the current period: Set bit 0. */
			pc_history |= 1;
		}
	}

	/*
	 * Declare pressure_detected after phantom_cache_contiguous_periods.
	 *
	 * Create a bitmask with the N low bits set. These bits must all be set
	 * in pc_history. The high bits of pc_history are ignored.
	 */
	uint32_t bitmask = (1u << phantom_cache_contiguous_periods) - 1;

	if ((pc_history & bitmask) == bitmask)
		pressure_detected = TRUE;

	if (vm_page_external_count > ((AVAILABLE_MEMORY) * 50) / 100)
		pressure_detected = FALSE;

#if PHANTOM_CACHE_DEBUG
	sample_period_ghost_counts[sample_period_ghost_counts_indx].pressure_detected = pressure_detected;
#endif

	return (pressure_detected);
}
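
/*
 * Illustrative worked example, not part of the original source: with
 * phantom_cache_contiguous_periods = 2, bitmask = 0x3, so pressure is
 * declared only when bits 0 and 1 of pc_history are both set, i.e. the
 * current and the immediately preceding sample periods were each judged
 * to be thrashing.  A history like ...0101 (alternating periods) never
 * trips the detector, because the shift at each period reset clears bit 0
 * until is_thrashing() sets it again.
 */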
/*
 * Restart the current sampling because conditions have changed significantly,
 * and we don't want to react to old data.
 *
 * This function can be called from any thread.
 */
void
vm_phantom_cache_restart_sample(void)
{
	pc_need_eval_reset = TRUE;
}