/*
 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phantom_cache.h>
#include <vm/vm_compressor.h>
uint32_t phantom_cache_eval_period_in_msecs = 250;
uint32_t phantom_cache_thrashing_threshold_ssd = 1000;
uint32_t phantom_cache_thrashing_threshold = 100;

/*
 * Number of consecutive thrashing periods required before
 * vm_phantom_cache_check_pressure() returns true.
 */
unsigned phantom_cache_contiguous_periods = 2;

clock_sec_t	pc_start_of_eval_period_sec = 0;
clock_nsec_t	pc_start_of_eval_period_nsec = 0;
boolean_t	pc_need_eval_reset = FALSE;

/* One bit per recent sampling period. Bit 0 = current period. */
uint32_t pc_history = 0;

uint32_t sample_period_ghost_added_count = 0;
uint32_t sample_period_ghost_added_count_ssd = 0;
uint32_t sample_period_ghost_found_count = 0;
uint32_t sample_period_ghost_found_count_ssd = 0;

uint32_t vm_phantom_object_id = 1;
#define VM_PHANTOM_OBJECT_ID_AFTER_WRAP	1000000

vm_ghost_t	vm_phantom_cache;
uint32_t	vm_phantom_cache_nindx = 1;
uint32_t	vm_phantom_cache_num_entries = 0;
uint32_t	vm_phantom_cache_size;

typedef uint32_t	vm_phantom_hash_entry_t;
vm_phantom_hash_entry_t	*vm_phantom_cache_hash;
uint32_t	vm_phantom_cache_hash_size;
uint32_t	vm_ghost_hash_mask;	/* Mask for hash function */
uint32_t	vm_ghost_bucket_hash;	/* Basic bucket hash */
#define vm_phantom_hash(obj_id, offset) (\
	( (natural_t)((uintptr_t)obj_id * vm_ghost_bucket_hash) + (offset ^ vm_ghost_bucket_hash)) & vm_ghost_hash_mask)
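
/*
 * Illustrative sketch, not part of the original source and not compiled:
 * a worked example of the bucket computation for a hypothetical 4096-entry
 * table, for which vm_phantom_cache_init() below would produce
 * vm_ghost_bucket_hash = 73 and vm_ghost_hash_mask = 0xFFF.
 */
#if 0
static uint32_t
vm_phantom_hash_example(void)
{
	uint32_t obj_id = 5;	/* hypothetical phantom_object_id */
	uint64_t offset = 18;	/* hypothetical g_obj_offset */

	/* (5 * 73) + (18 ^ 73) = 365 + 91 = 456, then 456 & 0xFFF = 456 */
	return vm_phantom_hash(obj_id, offset);
}
#endif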
struct phantom_cache_stats {
	uint32_t	pcs_wrapped;
	uint32_t	pcs_added_page_to_entry;
	uint32_t	pcs_added_new_entry;
	uint32_t	pcs_replaced_entry;

	uint32_t	pcs_lookup_found_page_in_cache;
	uint32_t	pcs_lookup_entry_not_in_cache;
	uint32_t	pcs_lookup_page_not_in_entry;

	uint32_t	pcs_updated_phantom_state;
} phantom_cache_stats;
void
vm_phantom_cache_init()
{
	unsigned int	num_entries;
	unsigned int	log1;
	unsigned int	size;

	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE)
		return;

	num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY);
	vm_phantom_cache_num_entries = 1;

	while (vm_phantom_cache_num_entries < num_entries)
		vm_phantom_cache_num_entries <<= 1;

	vm_phantom_cache_size = sizeof(struct vm_ghost) * vm_phantom_cache_num_entries;
	vm_phantom_cache_hash_size = sizeof(vm_phantom_hash_entry_t) * vm_phantom_cache_num_entries;

	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache), vm_phantom_cache_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS)
		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
	bzero(vm_phantom_cache, vm_phantom_cache_size);

	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache_hash), vm_phantom_cache_hash_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS)
		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
	bzero(vm_phantom_cache_hash, vm_phantom_cache_hash_size);

	vm_ghost_hash_mask = vm_phantom_cache_num_entries - 1;

	/*
	 * Calculate object_id shift value for hashing algorithm:
	 *   O = log2(sizeof(struct vm_object))
	 *   B = log2(vm_page_bucket_count)
	 *   hash shifts the object_id left by
	 *   B/2 - O
	 */
	size = vm_phantom_cache_num_entries;
	for (log1 = 0; size > 1; log1++)
		size /= 2;

	vm_ghost_bucket_hash = 1 << ((log1 + 1) >> 1);	/* Get (ceiling of sqrt of table size) */
	vm_ghost_bucket_hash |= 1 << ((log1 + 1) >> 2);	/* Get (ceiling of quadroot of table size) */
	vm_ghost_bucket_hash |= 1;			/* Set bit and add 1 - always must be 1 to ensure unique series */

	if (vm_ghost_hash_mask & vm_phantom_cache_num_entries)
		printf("vm_phantom_cache_init: WARNING -- strange page hash\n");
}
void
vm_phantom_cache_add_ghost(vm_page_t m)
{
	vm_ghost_t	vpce;
	vm_object_t	object;
	int		ghost_index;
	int		pg_mask;
	boolean_t	isSSD = FALSE;
	vm_phantom_hash_entry_t	ghost_hash_index;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	if (vm_phantom_cache_num_entries == 0)
		return;

	pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if (object->phantom_object_id == 0) {

		vnode_pager_get_isSSD(object->pager, &isSSD);

		if (isSSD == TRUE)
			object->phantom_isssd = TRUE;

		object->phantom_object_id = vm_phantom_object_id++;

		if (vm_phantom_object_id == 0)
			vm_phantom_object_id = VM_PHANTOM_OBJECT_ID_AFTER_WRAP;
	} else {
		if ( (vpce = vm_phantom_cache_lookup_ghost(m, 0)) ) {
			vpce->g_pages_held |= pg_mask;

			phantom_cache_stats.pcs_added_page_to_entry++;
			goto done;
		}
	}
	/*
	 * if we're here then the vm_ghost_t of this vm_page_t
	 * is not present in the phantom cache... take the next
	 * available entry in the LRU first evicting the existing
	 * entry if we've wrapped the ring
	 */
	ghost_index = vm_phantom_cache_nindx++;

	if (vm_phantom_cache_nindx == vm_phantom_cache_num_entries) {
		vm_phantom_cache_nindx = 1;

		phantom_cache_stats.pcs_wrapped++;
	}
	vpce = &vm_phantom_cache[ghost_index];

	if (vpce->g_obj_id) {
		/*
		 * we're going to replace an existing entry
		 * so first remove it from the hash
		 */
		vm_ghost_t	nvpce;

		ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);

		nvpce = &vm_phantom_cache[vm_phantom_cache_hash[ghost_hash_index]];

		if (nvpce == vpce) {
			vm_phantom_cache_hash[ghost_hash_index] = vpce->g_next_index;
		} else {
			for (;;) {
				if (nvpce->g_next_index == 0)
					panic("didn't find ghost in hash\n");

				if (&vm_phantom_cache[nvpce->g_next_index] == vpce) {
					nvpce->g_next_index = vpce->g_next_index;
					break;
				}
				nvpce = &vm_phantom_cache[nvpce->g_next_index];
			}
		}
		phantom_cache_stats.pcs_replaced_entry++;
	} else
		phantom_cache_stats.pcs_added_new_entry++;

	vpce->g_pages_held = pg_mask;
	vpce->g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
	vpce->g_obj_id = object->phantom_object_id;

	ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);
	vpce->g_next_index = vm_phantom_cache_hash[ghost_hash_index];
	vm_phantom_cache_hash[ghost_hash_index] = ghost_index;

done:
	if (object->phantom_isssd)
		OSAddAtomic(1, &sample_period_ghost_added_count_ssd);
	else
		OSAddAtomic(1, &sample_period_ghost_added_count);
}
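
/*
 * Illustrative sketch, not part of the original source and not compiled:
 * each vm_ghost entry covers a run of VM_GHOST_PAGES_PER_ENTRY consecutive
 * pages with one g_pages_held bit per page.  Assuming
 * VM_GHOST_PAGES_PER_ENTRY is 4 (so VM_GHOST_PAGE_MASK is 0x3) and that
 * pg_masks[] holds single-bit masks, the table lookup above behaves like:
 */
#if 0
static uint32_t
pg_mask_for_offset(vm_object_offset_t offset)
{
	/* page 0 of the run -> 0x1, page 1 -> 0x2, page 2 -> 0x4, page 3 -> 0x8 */
	return 1u << ((offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK);
}
#endif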
vm_ghost_t
vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask)
{
	uint64_t	g_obj_offset;
	uint32_t	g_obj_id;
	uint32_t	ghost_index;
	vm_object_t	object;
	vm_ghost_t	vpce;

	object = VM_PAGE_OBJECT(m);

	if ((g_obj_id = object->phantom_object_id) == 0) {
		/*
		 * no entries in phantom cache for this object
		 */
		return (NULL);
	}
	g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;

	ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)];

	while (ghost_index) {

		vpce = &vm_phantom_cache[ghost_index];

		if (vpce->g_obj_id == g_obj_id && vpce->g_obj_offset == g_obj_offset) {

			if (pg_mask == 0 || (vpce->g_pages_held & pg_mask)) {
				phantom_cache_stats.pcs_lookup_found_page_in_cache++;

				return (vpce);
			}
			phantom_cache_stats.pcs_lookup_page_not_in_entry++;

			return (NULL);
		}
		ghost_index = vpce->g_next_index;
	}
	phantom_cache_stats.pcs_lookup_entry_not_in_cache++;

	return (NULL);
}
void
vm_phantom_cache_update(vm_page_t m)
{
	int		pg_mask;
	vm_ghost_t	vpce;
	vm_object_t	object;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	if (vm_phantom_cache_num_entries == 0)
		return;

	pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if ( (vpce = vm_phantom_cache_lookup_ghost(m, pg_mask)) ) {

		vpce->g_pages_held &= ~pg_mask;

		phantom_cache_stats.pcs_updated_phantom_state++;

		if (object->phantom_isssd)
			OSAddAtomic(1, &sample_period_ghost_found_count_ssd);
		else
			OSAddAtomic(1, &sample_period_ghost_found_count);
	}
}
#define PHANTOM_CACHE_DEBUG	1

#if PHANTOM_CACHE_DEBUG

int	sample_period_ghost_counts_indx = 0;

struct {
	uint32_t	added;
	uint32_t	found;
	uint32_t	added_ssd;
	uint32_t	found_ssd;
	uint32_t	elapsed_ms;
	boolean_t	pressure_detected;
} sample_period_ghost_counts[256];

#endif
/*
 * Determine if the file cache is thrashing from sampling interval statistics.
 *
 * Pages added to the phantom cache = pages evicted from the file cache.
 * Pages found in the phantom cache = reads of pages that were recently evicted.
 * Threshold is the latency-dependent number of reads we consider thrashing.
 */
static boolean_t
is_thrashing(uint32_t added, uint32_t found, uint32_t threshold)
{
	/* Ignore normal activity below the threshold. */
	if (added < threshold || found < threshold)
		return FALSE;

	/*
	 * When thrashing in a way that we can mitigate, most of the pages read
	 * into the file cache were recently evicted, and 'found' will be close
	 * to 'added'.
	 *
	 * When replacing the current working set because a new app is
	 * launched, we see very high read traffic with sporadic phantom cache
	 * hits.
	 *
	 * This is not thrashing, or freeing up memory wouldn't help much
	 * anyway.
	 */
	if (found < added / 2)
		return FALSE;

	return TRUE;
}
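
/*
 * Illustrative sketch, not part of the original source and not compiled:
 * with the non-SSD threshold of 100, a period with added = 500 and
 * found = 400 is judged thrashing (both counts meet the threshold and
 * found >= added / 2), while added = 500, found = 150 is not (150 < 250),
 * and added = 50, found = 50 is ignored as normal activity.
 */
#if 0
static void
is_thrashing_example(void)
{
	assert(is_thrashing(500, 400, phantom_cache_thrashing_threshold));	/* TRUE */
	assert(!is_thrashing(500, 150, phantom_cache_thrashing_threshold));	/* FALSE: found < added / 2 */
	assert(!is_thrashing(50, 50, phantom_cache_thrashing_threshold));	/* FALSE: below threshold */
}
#endif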
/*
 * the following function is never called
 * from multiple threads simultaneously due
 * to a condition variable used to serialize
 * at the compressor level... thus no need
 * to provide locking for the sample processing
 */
boolean_t
vm_phantom_cache_check_pressure()
{
	clock_sec_t	cur_ts_sec;
	clock_nsec_t	cur_ts_nsec;
	uint64_t	elapsed_msecs_in_eval;
	boolean_t	pressure_detected = FALSE;

	clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);

	elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, pc_start_of_eval_period_sec, pc_start_of_eval_period_nsec);

	/*
	 * Reset evaluation period after phantom_cache_eval_period_in_msecs or
	 * whenever vm_phantom_cache_restart_sample has been called.
	 */
	if (elapsed_msecs_in_eval >= phantom_cache_eval_period_in_msecs) {
		pc_need_eval_reset = TRUE;
	}

	if (pc_need_eval_reset == TRUE) {

#if PHANTOM_CACHE_DEBUG
		/*
		 * maintain some info about the last 256 sample periods
		 */
		sample_period_ghost_counts[sample_period_ghost_counts_indx].added = sample_period_ghost_added_count;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].found = sample_period_ghost_found_count;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].added_ssd = sample_period_ghost_added_count_ssd;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].found_ssd = sample_period_ghost_found_count_ssd;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].elapsed_ms = (uint32_t)elapsed_msecs_in_eval;

		sample_period_ghost_counts_indx++;

		if (sample_period_ghost_counts_indx >= 256)
			sample_period_ghost_counts_indx = 0;
#endif
		sample_period_ghost_added_count = 0;
		sample_period_ghost_found_count = 0;
		sample_period_ghost_added_count_ssd = 0;
		sample_period_ghost_found_count_ssd = 0;

		pc_start_of_eval_period_sec = cur_ts_sec;
		pc_start_of_eval_period_nsec = cur_ts_nsec;
		pc_history <<= 1;
		pc_need_eval_reset = FALSE;
	} else {
		/*
		 * Since the thrashing rate is really a function of the read latency
		 * of the disk, we have to consider both the SSD and spinning disk
		 * cases, since the file cache could be backed by either or even both
		 * flavors. When the object is first assigned a phantom_object_id,
		 * we query the pager to determine if the backing media is an SSD
		 * and remember that answer in the vm_object. We use that info to
		 * maintain counts for both the SSD and spinning disk cases.
		 */
		if (is_thrashing(sample_period_ghost_added_count,
				 sample_period_ghost_found_count,
				 phantom_cache_thrashing_threshold) ||
		    is_thrashing(sample_period_ghost_added_count_ssd,
				 sample_period_ghost_found_count_ssd,
				 phantom_cache_thrashing_threshold_ssd)) {
			/* Thrashing in the current period: Set bit 0. */
			pc_history |= 1;
		}
	}

	/*
	 * Declare pressure_detected after phantom_cache_contiguous_periods.
	 *
	 * Create a bitmask with the N low bits set. These bits must all be set
	 * in pc_history. The high bits of pc_history are ignored.
	 */
	uint32_t bitmask = (1u << phantom_cache_contiguous_periods) - 1;

	if ((pc_history & bitmask) == bitmask)
		pressure_detected = TRUE;

	if (vm_page_external_count > ((AVAILABLE_MEMORY) * 50) / 100)
		pressure_detected = FALSE;

#if PHANTOM_CACHE_DEBUG
	sample_period_ghost_counts[sample_period_ghost_counts_indx].pressure_detected = pressure_detected;
#endif

	return (pressure_detected);
}
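
/*
 * Illustrative worked example, not part of the original source: with
 * phantom_cache_contiguous_periods = 2, bitmask = 0x3, so pressure is
 * declared only when bits 0 and 1 of pc_history are both set, i.e. the
 * current and the immediately preceding sample periods were each judged
 * to be thrashing.  A history like ...0101 (alternating periods) never
 * trips the detector, because the shift at each period reset clears bit 0
 * until is_thrashing() sets it again.
 */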
/*
 * Restart the current sampling because conditions have changed significantly,
 * and we don't want to react to old data.
 *
 * This function can be called from any thread.
 */
void
vm_phantom_cache_restart_sample(void)
{
	pc_need_eval_reset = TRUE;
}