/*
 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phantom_cache.h>
#include <vm/vm_compressor.h>
uint32_t phantom_cache_eval_period_in_msecs = 250;
uint32_t phantom_cache_thrashing_threshold_ssd = 1000;
#if !XNU_TARGET_OS_OSX
uint32_t phantom_cache_thrashing_threshold = 500;
#else /* !XNU_TARGET_OS_OSX */
uint32_t phantom_cache_thrashing_threshold = 50;
#endif /* !XNU_TARGET_OS_OSX */
/*
 * Number of consecutive thrashing periods required before
 * vm_phantom_cache_check_pressure() returns true.
 */
#if !XNU_TARGET_OS_OSX
unsigned phantom_cache_contiguous_periods = 4;
#else /* !XNU_TARGET_OS_OSX */
unsigned phantom_cache_contiguous_periods = 2;
#endif /* !XNU_TARGET_OS_OSX */
clock_sec_t	pc_start_of_eval_period_sec = 0;
clock_nsec_t	pc_start_of_eval_period_nsec = 0;
boolean_t	pc_need_eval_reset = FALSE;

/* One bit per recent sampling period. Bit 0 = current period. */
uint32_t	pc_history = 0;

uint32_t	sample_period_ghost_added_count = 0;
uint32_t	sample_period_ghost_added_count_ssd = 0;
uint32_t	sample_period_ghost_found_count = 0;
uint32_t	sample_period_ghost_found_count_ssd = 0;

uint32_t	vm_phantom_object_id = 1;
#define VM_PHANTOM_OBJECT_ID_AFTER_WRAP	1000000
vm_ghost_t	vm_phantom_cache;
uint32_t	vm_phantom_cache_nindx = 1;
uint32_t	vm_phantom_cache_num_entries = 0;
uint32_t	vm_phantom_cache_size;

typedef uint32_t	vm_phantom_hash_entry_t;
vm_phantom_hash_entry_t	*vm_phantom_cache_hash;
uint32_t	vm_phantom_cache_hash_size;
uint32_t	vm_ghost_hash_mask;	/* Mask for hash function */
uint32_t	vm_ghost_bucket_hash;	/* Basic bucket hash */

/* Per-slot bit masks for the pages tracked by each ghost entry. */
int	pg_masks[4] = {
	0x1, 0x2, 0x4, 0x8
};
#define vm_phantom_hash(obj_id, offset) (\
	( (natural_t)((uintptr_t)obj_id * vm_ghost_bucket_hash) + (offset ^ vm_ghost_bucket_hash)) & vm_ghost_hash_mask)
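
/*
 * Illustration (hypothetical values, not taken from a live system): with
 * vm_phantom_cache_num_entries == 16384, vm_ghost_hash_mask == 0x3fff and
 * vm_ghost_bucket_hash == ((1 << 7) | (1 << 3) | 1) == 0x89, an
 * (obj_id, offset) pair is mixed by the multiply and xor above, then
 * folded into a bucket index in [0, 0x3fff] by the final mask.
 */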
struct phantom_cache_stats {
	uint32_t	pcs_wrapped;
	uint32_t	pcs_added_page_to_entry;
	uint32_t	pcs_added_new_entry;
	uint32_t	pcs_replaced_entry;

	uint32_t	pcs_lookup_found_page_in_cache;
	uint32_t	pcs_lookup_entry_not_in_cache;
	uint32_t	pcs_lookup_page_not_in_entry;

	uint32_t	pcs_updated_phantom_state;
} phantom_cache_stats;
void
vm_phantom_cache_init()
{
	unsigned int	num_entries;
	unsigned int	log1;
	unsigned int	size;

	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		return;
	}
#if !XNU_TARGET_OS_OSX
	num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 10) / VM_GHOST_PAGES_PER_ENTRY);
#else /* !XNU_TARGET_OS_OSX */
	num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY);
#endif /* !XNU_TARGET_OS_OSX */
	vm_phantom_cache_num_entries = 1;

	/* round up to the next power of 2 so the mask/hash math works */
	while (vm_phantom_cache_num_entries < num_entries) {
		vm_phantom_cache_num_entries <<= 1;
	}
	/*
	 * We index this with g_next_index, so don't exceed the width of that bitfield.
	 */
	if (vm_phantom_cache_num_entries > (1 << VM_GHOST_INDEX_BITS)) {
		vm_phantom_cache_num_entries = (1 << VM_GHOST_INDEX_BITS);
	}

	vm_phantom_cache_size = sizeof(struct vm_ghost) * vm_phantom_cache_num_entries;
	vm_phantom_cache_hash_size = sizeof(vm_phantom_hash_entry_t) * vm_phantom_cache_num_entries;

	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache), vm_phantom_cache_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS) {
		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
	}
	bzero(vm_phantom_cache, vm_phantom_cache_size);

	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache_hash), vm_phantom_cache_hash_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS) {
		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
	}
	bzero(vm_phantom_cache_hash, vm_phantom_cache_hash_size);

	vm_ghost_hash_mask = vm_phantom_cache_num_entries - 1;

	/*
	 * Calculate object_id shift value for hashing algorithm:
	 *   O = log2(sizeof(struct vm_object))
	 *   B = log2(vm_page_bucket_count)
	 *   hash shifts the object_id left by
	 *   B/2 - O
	 */
	size = vm_phantom_cache_num_entries;
	for (log1 = 0; size > 1; log1++) {
		size /= 2;
	}

	vm_ghost_bucket_hash = 1 << ((log1 + 1) >> 1);	/* Get (ceiling of sqrt of table size) */
	vm_ghost_bucket_hash |= 1 << ((log1 + 1) >> 2);	/* Get (ceiling of quadroot of table size) */
	vm_ghost_bucket_hash |= 1;			/* Set bit and add 1 - always must be 1 to ensure unique series */

	if (vm_ghost_hash_mask & vm_phantom_cache_num_entries) {
		printf("vm_phantom_cache_init: WARNING -- strange page hash\n");
	}
}
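
/*
 * Sizing example (hypothetical numbers, assuming VM_GHOST_PAGES_PER_ENTRY
 * == 4): on a macOS machine with max_mem == 4GB and 4KB pages, the target
 * is ((1048576 / 4) / 4) == 65536 entries; that is already a power of two,
 * so the cache tracks up to a quarter of physical memory's pages using
 * 65536 ghost entries.
 */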
void
vm_phantom_cache_add_ghost(vm_page_t m)
{
	vm_ghost_t	vpce;
	vm_object_t	object;
	int		ghost_index;
	int		pg_mask;
	boolean_t	isSSD = FALSE;
	vm_phantom_hash_entry_t	ghost_hash_index;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	if (vm_phantom_cache_num_entries == 0) {
		return;
	}

	pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if (object->phantom_object_id == 0) {
		vnode_pager_get_isSSD(object->pager, &isSSD);

		if (isSSD == TRUE) {
			object->phantom_isssd = TRUE;
		}

		object->phantom_object_id = vm_phantom_object_id++;

		if (vm_phantom_object_id == 0) {
			vm_phantom_object_id = VM_PHANTOM_OBJECT_ID_AFTER_WRAP;
		}
	} else {
		if ((vpce = vm_phantom_cache_lookup_ghost(m, 0))) {
			vpce->g_pages_held |= pg_mask;

			phantom_cache_stats.pcs_added_page_to_entry++;
			goto done;
		}
	}
	/*
	 * if we're here then the vm_ghost_t of this vm_page_t
	 * is not present in the phantom cache... take the next
	 * available entry in the LRU first evicting the existing
	 * entry if we've wrapped the ring
	 */
	ghost_index = vm_phantom_cache_nindx++;

	if (vm_phantom_cache_nindx == vm_phantom_cache_num_entries) {
		vm_phantom_cache_nindx = 1;

		phantom_cache_stats.pcs_wrapped++;
	}
	vpce = &vm_phantom_cache[ghost_index];

	if (vpce->g_obj_id) {
		/*
		 * we're going to replace an existing entry
		 * so first remove it from the hash
		 */
		vm_ghost_t	nvpce;

		ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);

		nvpce = &vm_phantom_cache[vm_phantom_cache_hash[ghost_hash_index]];

		if (nvpce == vpce) {
			vm_phantom_cache_hash[ghost_hash_index] = vpce->g_next_index;
		} else {
			for (;;) {
				if (nvpce->g_next_index == 0) {
					panic("didn't find ghost in hash\n");
				}

				if (&vm_phantom_cache[nvpce->g_next_index] == vpce) {
					nvpce->g_next_index = vpce->g_next_index;
					break;
				}
				nvpce = &vm_phantom_cache[nvpce->g_next_index];
			}
		}
		phantom_cache_stats.pcs_replaced_entry++;
	} else {
		phantom_cache_stats.pcs_added_new_entry++;
	}

	vpce->g_pages_held = pg_mask;
	vpce->g_obj_offset = (m->vmp_offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
	vpce->g_obj_id = object->phantom_object_id;

	ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);
	vpce->g_next_index = vm_phantom_cache_hash[ghost_hash_index];
	vm_phantom_cache_hash[ghost_hash_index] = ghost_index;

done:
	vm_pageout_vminfo.vm_phantom_cache_added_ghost++;

	if (object->phantom_isssd) {
		OSAddAtomic(1, &sample_period_ghost_added_count_ssd);
	} else {
		OSAddAtomic(1, &sample_period_ghost_added_count);
	}
}
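
/*
 * Layout illustration (assuming 4KB pages, VM_GHOST_PAGE_MASK == 0x3 and
 * VM_GHOST_PAGES_PER_ENTRY == 4): one vm_ghost covers an aligned run of 4
 * file pages, so a page at vmp_offset 0x5000 lands in slot
 * (0x5000 >> 12) & 0x3 == 1 and is recorded as pg_masks[1] == 0x2 in
 * g_pages_held.
 */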
vm_ghost_t
vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask)
{
	uint64_t	g_obj_offset;
	uint32_t	g_obj_id;
	uint32_t	ghost_index;
	vm_object_t	object;
	vm_ghost_t	vpce;

	object = VM_PAGE_OBJECT(m);

	if ((g_obj_id = object->phantom_object_id) == 0) {
		/*
		 * no entries in phantom cache for this object
		 */
		return NULL;
	}
	g_obj_offset = (m->vmp_offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;

	ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)];

	while (ghost_index) {
		vpce = &vm_phantom_cache[ghost_index];

		if (vpce->g_obj_id == g_obj_id && vpce->g_obj_offset == g_obj_offset) {
			if (pg_mask == 0 || (vpce->g_pages_held & pg_mask)) {
				phantom_cache_stats.pcs_lookup_found_page_in_cache++;

				return vpce;
			}
			phantom_cache_stats.pcs_lookup_page_not_in_entry++;

			return NULL;
		}
		ghost_index = vpce->g_next_index;
	}
	phantom_cache_stats.pcs_lookup_entry_not_in_cache++;

	return NULL;
}
void
vm_phantom_cache_update(vm_page_t m)
{
	int		pg_mask;
	vm_ghost_t	vpce;
	vm_object_t	object;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	if (vm_phantom_cache_num_entries == 0) {
		return;
	}

	pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if ((vpce = vm_phantom_cache_lookup_ghost(m, pg_mask))) {
		vpce->g_pages_held &= ~pg_mask;

		phantom_cache_stats.pcs_updated_phantom_state++;
		vm_pageout_vminfo.vm_phantom_cache_found_ghost++;

		if (object->phantom_isssd) {
			OSAddAtomic(1, &sample_period_ghost_found_count_ssd);
		} else {
			OSAddAtomic(1, &sample_period_ghost_found_count);
		}
	}
}
#define PHANTOM_CACHE_DEBUG	1

#if PHANTOM_CACHE_DEBUG

int	sample_period_ghost_counts_indx = 0;

/* Ring of per-period statistics kept for the last 256 sample periods. */
struct {
	uint32_t	added;
	uint32_t	found;
	uint32_t	added_ssd;
	uint32_t	found_ssd;
	uint32_t	elapsed_ms;
	boolean_t	pressure_detected;
} sample_period_ghost_counts[256];

#endif
/*
 * Determine if the file cache is thrashing from sampling interval statistics.
 *
 * Pages added to the phantom cache = pages evicted from the file cache.
 * Pages found in the phantom cache = reads of pages that were recently evicted.
 * Threshold is the latency-dependent number of reads we consider thrashing.
 */
static boolean_t
is_thrashing(uint32_t added, uint32_t found, uint32_t threshold)
{
	/* Ignore normal activity below the threshold. */
	if (added < threshold || found < threshold) {
		return FALSE;
	}

	/*
	 * When thrashing in a way that we can mitigate, most of the pages read
	 * into the file cache were recently evicted, and 'found' will be close
	 * to 'added'.
	 *
	 * When replacing the current working set because a new app is
	 * launched, we see very high read traffic with sporadic phantom cache
	 * hits.
	 *
	 * This is not thrashing, or freeing up memory wouldn't help much
	 * anyway.
	 */
	if (found < added / 2) {
		return FALSE;
	}

	return TRUE;
}
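
/*
 * Worked example (illustrative numbers): with threshold == 50, a period
 * with added == 400 and found == 300 counts as thrashing (both exceed the
 * threshold and 300 >= 400 / 2), while added == 400 and found == 60 does
 * not (60 < 200), since most of those reads were not recent evictions.
 */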
/*
 * the following function is never called
 * from multiple threads simultaneously due
 * to a condition variable used to serialize
 * at the compressor level... thus no need
 * to provide locking for the sample processing
 */
boolean_t
vm_phantom_cache_check_pressure()
{
	clock_sec_t	cur_ts_sec;
	clock_nsec_t	cur_ts_nsec;
	uint64_t	elapsed_msecs_in_eval;
	boolean_t	pressure_detected = FALSE;

	clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);

	elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, pc_start_of_eval_period_sec, pc_start_of_eval_period_nsec);

	/*
	 * Reset evaluation period after phantom_cache_eval_period_in_msecs or
	 * whenever vm_phantom_cache_restart_sample has been called.
	 */
	if (elapsed_msecs_in_eval >= phantom_cache_eval_period_in_msecs) {
		pc_need_eval_reset = TRUE;
	}

	if (pc_need_eval_reset == TRUE) {
#if PHANTOM_CACHE_DEBUG
		/*
		 * maintain some info about the last 256 sample periods
		 */
		sample_period_ghost_counts[sample_period_ghost_counts_indx].added = sample_period_ghost_added_count;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].found = sample_period_ghost_found_count;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].added_ssd = sample_period_ghost_added_count_ssd;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].found_ssd = sample_period_ghost_found_count_ssd;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].elapsed_ms = (uint32_t)elapsed_msecs_in_eval;

		sample_period_ghost_counts_indx++;

		if (sample_period_ghost_counts_indx >= 256) {
			sample_period_ghost_counts_indx = 0;
		}
#endif
		/* Start the next sampling period. */
		sample_period_ghost_added_count = 0;
		sample_period_ghost_found_count = 0;
		sample_period_ghost_added_count_ssd = 0;
		sample_period_ghost_found_count_ssd = 0;

		pc_start_of_eval_period_sec = cur_ts_sec;
		pc_start_of_eval_period_nsec = cur_ts_nsec;
		pc_history <<= 1;
		pc_need_eval_reset = FALSE;
	}
	/*
	 * Since the thrashing rate is really a function of the read latency of the disk
	 * we have to consider both the SSD and spinning disk case since the file cache
	 * could be backed by either or even both flavors. When the object is first
	 * assigned a phantom_object_id, we query the pager to determine if the backing
	 * media is an SSD and remember that answer in the vm_object. We use
	 * that info to maintain counts for both the SSD and spinning disk cases.
	 */
	if (is_thrashing(sample_period_ghost_added_count,
	    sample_period_ghost_found_count,
	    phantom_cache_thrashing_threshold) ||
	    is_thrashing(sample_period_ghost_added_count_ssd,
	    sample_period_ghost_found_count_ssd,
	    phantom_cache_thrashing_threshold_ssd)) {
		/* Thrashing in the current period: Set bit 0. */
		pc_history |= 1;
	}

	/*
	 * Declare pressure_detected after phantom_cache_contiguous_periods.
	 *
	 * Create a bitmask with the N low bits set. These bits must all be set
	 * in pc_history. The high bits of pc_history are ignored.
	 */
	uint32_t bitmask = (1u << phantom_cache_contiguous_periods) - 1;
	if ((pc_history & bitmask) == bitmask) {
		pressure_detected = TRUE;
	}

	if (vm_page_external_count > ((AVAILABLE_MEMORY) * 50) / 100) {
		pressure_detected = FALSE;
	}

#if PHANTOM_CACHE_DEBUG
	sample_period_ghost_counts[sample_period_ghost_counts_indx].pressure_detected = pressure_detected;
#endif
	return pressure_detected;
}
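
/*
 * History example (illustrative): with phantom_cache_contiguous_periods
 * == 2, bitmask == 0x3. If the previous period thrashed (its bit 0 was
 * shifted to bit 1 at the period boundary) and the current period is
 * thrashing (bit 0 set again), the low bits of pc_history are 0b11 and
 * pressure is declared, unless the file cache still holds more than half
 * of available memory.
 */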
/*
 * Restart the current sampling because conditions have changed significantly,
 * and we don't want to react to old data.
 *
 * This function can be called from any thread.
 */
void
vm_phantom_cache_restart_sample(void)
{
	pc_need_eval_reset = TRUE;
}