/*
 * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/kern_event.h>
#include <kern/sched_prim.h>
#include <kern/assert.h>
#include <kern/debug.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/host.h>
#include <kern/policy_internal.h>
#include <kern/thread_group.h>

#include <IOKit/IOBSD.h>

#include <libkern/libkern.h>
#include <mach/coalition.h>
#include <mach/mach_time.h>
#include <mach/task.h>
#include <mach/host_priv.h>
#include <mach/mach_host.h>
#include <pexpert/pexpert.h>
#include <sys/coalition.h>
#include <sys/kern_event.h>
#include <sys/proc_info.h>
#include <sys/reason.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <vm/vm_pageout.h>
#include <vm/vm_protos.h>
#include <mach/machine/sdt.h>
#include <libkern/section_keywords.h>
#include <stdatomic.h>

#if CONFIG_FREEZE
#include <vm/vm_map.h>
#endif /* CONFIG_FREEZE */

#include <sys/kern_memorystatus.h>
#include <sys/kern_memorystatus_notify.h>
/*
 * Memorystatus klist structures
 */
struct klist memorystatus_klist;
static lck_mtx_t memorystatus_klist_mutex;
static void memorystatus_klist_lock(void);
static void memorystatus_klist_unlock(void);
/*
 * Memorystatus kevent filter routines
 */
static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_memorystatusdetach(struct knote *kn);
static int filt_memorystatus(struct knote *kn, long hint);
static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);

SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
    .f_attach = filt_memorystatusattach,
    .f_detach = filt_memorystatusdetach,
    .f_event = filt_memorystatus,
    .f_touch = filt_memorystatustouch,
    .f_process = filt_memorystatusprocess,
};
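/*
 * Illustrative userspace consumer (not part of this file): the filterops above
 * back EVFILT_MEMORYSTATUS, which libdispatch exposes as
 * DISPATCH_SOURCE_TYPE_MEMORYPRESSURE. A minimal sketch of how a process might
 * observe the pressure transitions delivered by this code is kept below under
 * "#if 0" purely as documentation; all names used in it (dispatch_*) come from
 * the public libdispatch API, and the handler body is only an example.
 */
#if 0
#include <dispatch/dispatch.h>
#include <stdio.h>

static void
register_for_memory_pressure(void)
{
    /* One source covers all three transitions this file can broadcast. */
    dispatch_source_t source = dispatch_source_create(DISPATCH_SOURCE_TYPE_MEMORYPRESSURE,
        0, /* ident is unused for this source type */
        DISPATCH_MEMORYPRESSURE_NORMAL | DISPATCH_MEMORYPRESSURE_WARN | DISPATCH_MEMORYPRESSURE_CRITICAL,
        dispatch_get_global_queue(QOS_CLASS_UTILITY, 0));

    dispatch_source_set_event_handler(source, ^{
        /* dispatch_source_get_data() reports which pressure level fired. */
        unsigned long level = dispatch_source_get_data(source);
        printf("memory pressure level: 0x%lx\n", level);
    });
    dispatch_activate(source);
}
#endif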
/*
 * Memorystatus notification events
 */
enum {
    kMemorystatusNoPressure = 0x1,
    kMemorystatusPressure = 0x2,
    kMemorystatusLowSwap = 0x4,
    kMemorystatusProcLimitWarn = 0x8,
    kMemorystatusProcLimitCritical = 0x10
};
#define INTER_NOTIFICATION_DELAY                 (250000)  /* .25 second */
#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD   5000      /* milliseconds */
#define WARNING_NOTIFICATION_RESTING_PERIOD      25        /* seconds */
#define CRITICAL_NOTIFICATION_RESTING_PERIOD     25        /* seconds */
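/*
 * Note on units (summarizing how the constants are used below):
 * INTER_NOTIFICATION_DELAY is passed to delay(), so it is expressed in
 * microseconds (250000 == 0.25 s; the code below uses INTER_NOTIFICATION_DELAY * 4
 * for a 1 second pause). The two *_RESTING_PERIOD values are in seconds and are
 * converted with nanoseconds_to_absolutetime(period * NSEC_PER_SEC, ...) before
 * being compared against mach_absolute_time().
 */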
/*
 * Memorystatus notification helper routines
 */
static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote *, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process);
static void vm_dispatch_memory_pressure(void);
kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
#if VM_PRESSURE_EVENTS

/*
 * This value is the threshold that a process must meet to be considered for scavenging.
 */
#if XNU_TARGET_OS_OSX
#define VM_PRESSURE_MINIMUM_RSIZE       10      /* MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_PRESSURE_MINIMUM_RSIZE       6       /* MB */
#endif /* XNU_TARGET_OS_OSX */

static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;

#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
#endif /* DEVELOPMENT || DEBUG */

vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;

/*
 * We use this flag to signal if we have any HWM offenders
 * on the system. This way we can reduce the number of wakeups
 * of the memorystatus_thread when the system is between the
 * "pressure" and "critical" threshold.
 *
 * The (re-)setting of this variable is done without any locks
 * or synchronization simply because it is not possible (currently)
 * to keep track of HWM offenders that drop down below their memory
 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
 * by allowing the unguarded modification of this variable.
 */
boolean_t memorystatus_hwm_candidates = 0;

#endif /* VM_PRESSURE_EVENTS */
#if CONFIG_JETSAM

extern unsigned int memorystatus_available_pages;
extern unsigned int memorystatus_available_pages_pressure;
extern unsigned int memorystatus_available_pages_critical;
extern unsigned int memorystatus_available_pages_critical_base;
extern unsigned int memorystatus_available_pages_critical_idle_offset;

#else /* CONFIG_JETSAM */

extern uint64_t memorystatus_available_pages;
extern uint64_t memorystatus_available_pages_pressure;
extern uint64_t memorystatus_available_pages_critical;

#endif /* CONFIG_JETSAM */
extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
uint32_t memorystatus_jetsam_fg_band_waiters = 0;
static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
static uint64_t memorystatus_jetsam_fg_band_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */

extern boolean_t(*volatile consider_buffer_cache_collect)(int);

#if DEVELOPMENT || DEBUG
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_fg_band_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
    &memorystatus_jetsam_fg_band_delay_ns, "");
#endif /* DEVELOPMENT || DEBUG */
static int
filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
    int error;

    kn->kn_flags |= EV_CLEAR; /* automatically set */
    kn->kn_sdata = 0;         /* incoming data is ignored */

    error = memorystatus_knote_register(kn);
    if (error) {
        knote_set_error(kn, error);
    }
    return 0;
}
static void
filt_memorystatusdetach(struct knote *kn)
{
    memorystatus_knote_unregister(kn);
}
static int
filt_memorystatus(struct knote *kn __unused, long hint)
{
    if (hint) {
        switch (hint) {
        case kMemorystatusNoPressure:
            if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
                kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
            }
            break;

        case kMemorystatusPressure:
            if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
                if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
                    kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
                }
            } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
                if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
                    kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
                }
            }
            break;

        case kMemorystatusLowSwap:
            if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
                kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
            }
            break;

        case kMemorystatusProcLimitWarn:
            if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
                kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
            }
            break;

        case kMemorystatusProcLimitCritical:
            if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
                kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
            }
            break;

        default:
            break;
        }
    }

#if 0
    if (kn->kn_fflags != 0) {
        proc_t knote_proc = knote_get_kq(kn)->kq_p;
        pid_t knote_pid = knote_proc->p_pid;

        printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
            (unsigned long)kn, kn->kn_fflags, knote_pid);
    }
#endif

    return kn->kn_fflags != 0;
}
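/*
 * Note: filt_memorystatus() is the f_event callback. It runs when KNOTE() is
 * posted against memorystatus_klist with one of the kMemorystatus* hints above,
 * and it translates that hint into the corresponding NOTE_MEMORYSTATUS_* output
 * flag, but only if the knote's saved filter flags (kn_sfflags) show the process
 * asked for that event. A non-zero kn_fflags is what marks the knote as ready
 * for delivery.
 */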
static int
filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
{
    int res;
    int prev_kn_sfflags = 0;

    memorystatus_klist_lock();

    /*
     * copy in new kevent settings
     * (saving the "desired" data and fflags).
     */

    prev_kn_sfflags = kn->kn_sfflags;
    kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);

#if XNU_TARGET_OS_OSX
    /*
     * Only on desktop do we restrict notifications to
     * one per active/inactive state (soft limits only).
     */
    if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
        /*
         * Is there previous state to preserve?
         */
        if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
            /*
             * This knote was previously interested in proc_limit_warn,
             * so yes, preserve previous state.
             */
            if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
                kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
            }
            if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
                kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
            }
        } else {
            /*
             * This knote was not previously interested in proc_limit_warn,
             * but it is now. Set both states.
             */
            kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
            kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
        }
    }

    if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
        /*
         * Is there previous state to preserve?
         */
        if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
            /*
             * This knote was previously interested in proc_limit_critical,
             * so yes, preserve previous state.
             */
            if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
                kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
            }
            if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
                kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
            }
        } else {
            /*
             * This knote was not previously interested in proc_limit_critical,
             * but it is now. Set both states.
             */
            kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
            kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
        }
    }
#endif /* XNU_TARGET_OS_OSX */

    /*
     * reset the output flags based on a
     * combination of the old events and
     * the new desired event list.
     */
    //kn->kn_fflags &= kn->kn_sfflags;

    res = (kn->kn_fflags != 0);

    memorystatus_klist_unlock();

    return res;
}
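/*
 * Note: the _ACTIVE/_INACTIVE bits handled above are internal bookkeeping, not
 * flags a client passes in. They act as one-shot tokens on desktop:
 * memorystatus_warn_process() clears the relevant bit after delivering a
 * soft-limit warning, and f_touch re-arms both bits only when a client newly
 * expresses interest, so re-issuing the same kevent registration does not reset
 * an already-consumed notification.
 */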
static int
filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
{
    int res = 0;

    memorystatus_klist_lock();
    if (kn->kn_fflags) {
        knote_fill_kevent(kn, kev, 0);
        res = 1;
    }
    memorystatus_klist_unlock();

    return res;
}
static void
memorystatus_klist_lock(void)
{
    lck_mtx_lock(&memorystatus_klist_mutex);
}

static void
memorystatus_klist_unlock(void)
{
    lck_mtx_unlock(&memorystatus_klist_mutex);
}
void
memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
{
    lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
    klist_init(&memorystatus_klist);
}
int
memorystatus_knote_register(struct knote *kn)
{
    int error = 0;

    memorystatus_klist_lock();

    /*
     * Support only userspace visible flags.
     */
    if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
#if XNU_TARGET_OS_OSX
        if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
            kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
            kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
        }

        if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
            kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
            kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
        }
#endif /* XNU_TARGET_OS_OSX */

        KNOTE_ATTACH(&memorystatus_klist, kn);
    } else {
        error = 1;
    }

    memorystatus_klist_unlock();

    return error;
}
void
memorystatus_knote_unregister(struct knote *kn __unused)
{
    memorystatus_klist_lock();
    KNOTE_DETACH(&memorystatus_klist, kn);
    memorystatus_klist_unlock();
}
#if VM_PRESSURE_EVENTS

#if CONFIG_MEMORYSTATUS
static int
memorystatus_send_note_internal(int event_code, int subclass, void *data, uint32_t data_length)
{
    int ret;
    struct kev_msg ev_msg;

    ev_msg.vendor_code  = KEV_VENDOR_APPLE;
    ev_msg.kev_class    = KEV_SYSTEM_CLASS;
    ev_msg.kev_subclass = subclass;

    ev_msg.event_code   = event_code;

    ev_msg.dv[0].data_length = data_length;
    ev_msg.dv[0].data_ptr    = data;
    ev_msg.dv[1].data_length = 0;

    ret = kev_post_msg(&ev_msg);
    if (ret) {
        printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
    }

    return ret;
}
int
memorystatus_send_note(int event_code, void *data, uint32_t data_length)
{
    return memorystatus_send_note_internal(event_code, KEV_MEMORYSTATUS_SUBCLASS, data, data_length);
}
int
memorystatus_send_dirty_status_change_note(void *data, uint32_t data_length)
{
    return memorystatus_send_note_internal(kDirtyStatusChangeNote, KEV_DIRTYSTATUS_SUBCLASS, data, data_length);
}
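/*
 * Delivery note: the two helpers above publish broadcast events through the
 * kernel event (kev_post_msg) mechanism rather than through memorystatus_klist.
 * Userspace observers read these from a PF_SYSTEM/SYSPROTO_EVENT socket by
 * filtering on KEV_SYSTEM_CLASS and the KEV_MEMORYSTATUS_SUBCLASS or
 * KEV_DIRTYSTATUS_SUBCLASS subclass used here.
 */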
boolean_t
memorystatus_warn_process(const proc_t p, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
{
    /*
     * This function doesn't take a reference to p or lock it. So it better be the current process.
     */
    assert(p == current_proc());
    pid_t pid = p->p_pid;
    boolean_t ret = FALSE;
    boolean_t found_knote = FALSE;
    struct knote *kn = NULL;
    int send_knote_count = 0;
    uint32_t platform;

    platform = proc_platform(p);

    /*
     * See comment in sysctl_memorystatus_vm_pressure_send.
     */

    memorystatus_klist_lock();

    SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
        proc_t knote_proc = knote_get_kq(kn)->kq_p;
        pid_t knote_pid = knote_proc->p_pid;

        if (knote_pid == pid) {
            /*
             * By setting the "fflags" here, we are forcing
             * a process to deal with the case where it's
             * bumping up into its memory limits. If we don't
             * do this here, we will end up depending on the
             * system pressure snapshot evaluation in
             * filt_memorystatus().
             */

            /*
             * The type of notification and the frequency are different between
             * embedded and desktop.
             *
             * Embedded processes register for global pressure notifications
             * (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) via UIKit
             * (see applicationDidReceiveMemoryWarning in UIKit). We'll warn them here if
             * they are near their memory limit. filt_memorystatus() will warn them based
             * on the system pressure level.
             *
             * On desktop, (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL)
             * are only expected to fire for system level warnings. Desktop processes
             * register for NOTE_MEMORYSTATUS_PROC_LIMIT_WARN
             * if they want to be warned when they approach their limit
             * and for NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL to be warned when they
             * exceed their limit.
             *
             * On embedded we continuously warn processes that are approaching their
             * memory limit. However on desktop, we only send one warning while
             * the process is active/inactive if the limit is soft.
             */
            if (platform == PLATFORM_MACOS || platform == PLATFORM_MACCATALYST || platform == PLATFORM_DRIVERKIT) {
                if (!limit_exceeded) {
                    if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
                        found_knote = TRUE;
                        if (!is_fatal) {
                            /*
                             * Restrict proc_limit_warn notifications when
                             * non-fatal (soft) limit is at play.
                             */
                            if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
                                /*
                                 * Mark this knote for delivery.
                                 */
                                kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
                                /*
                                 * And suppress it from future notifications.
                                 */
                                kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
                                send_knote_count++;
                            }

                            if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
                                /*
                                 * Mark this knote for delivery.
                                 */
                                kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
                                /*
                                 * And suppress it from future notifications.
                                 */
                                kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
                                send_knote_count++;
                            }
                        } else {
                            /*
                             * No restriction on proc_limit_warn notifications when
                             * fatal (hard) limit is at play.
                             */
                            kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
                            send_knote_count++;
                        }
                    }
                } else {
                    /*
                     * Send this notification when a process has exceeded a soft limit.
                     */
                    if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
                        found_knote = TRUE;
                        if (!is_fatal) {
                            /*
                             * Restrict critical notifications for soft limits.
                             */
                            if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
                                /*
                                 * Suppress future proc_limit_critical notifications
                                 * for the active soft limit.
                                 */
                                kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
                                kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
                                send_knote_count++;
                            }

                            if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
                                /*
                                 * Suppress future proc_limit_critical notifications
                                 * for the inactive soft limit.
                                 */
                                kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
                                kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
                                send_knote_count++;
                            }
                        } else {
                            /*
                             * We should never be trying to send a critical notification for
                             * a hard limit... the process would be killed before it could be
                             * received.
                             */
                            panic("Caught sending pid %d a critical warning for a fatal limit.\n", pid);
                        }
                    }
                }
            } else {
                if (!limit_exceeded) {
                    /*
                     * Intentionally set either the unambiguous limit warning,
                     * the system-wide critical or the system-wide warning
                     * notification bit.
                     */
                    if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
                        kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
                        found_knote = TRUE;
                        send_knote_count++;
                    } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
                        kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
                        found_knote = TRUE;
                        send_knote_count++;
                    } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
                        kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
                        found_knote = TRUE;
                        send_knote_count++;
                    }
                } else {
                    /*
                     * Send this notification when a process has exceeded a soft limit.
                     */
                    if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
                        kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
                        found_knote = TRUE;
                        send_knote_count++;
                    }
                }
            }
        }
    }

    if (found_knote) {
        if (send_knote_count > 0) {
            KNOTE(&memorystatus_klist, 0);
        }
        ret = TRUE;
    }

    memorystatus_klist_unlock();

    return ret;
}
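/*
 * Summary of the policy implemented above (restating the inline comments):
 * on macOS/Catalyst/DriverKit the soft-limit warning path is one-shot per
 * active/inactive state because the *_ACTIVE/_INACTIVE bits are consumed as
 * notifications go out, while a fatal (hard) limit warning is never throttled;
 * on other platforms every call that finds a registered knote re-arms a
 * notification, which is why embedded processes can be warned repeatedly as
 * they approach their limit.
 */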
/*
 * Can only be set by the current task on itself.
 */
int
memorystatus_low_mem_privileged_listener(uint32_t op_flags)
{
    boolean_t set_privilege = FALSE;
    /*
     * Need an entitlement check here?
     */
    if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
        set_privilege = TRUE;
    } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
        set_privilege = FALSE;
    } else {
        return EINVAL;
    }

    return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
}
int
memorystatus_send_pressure_note(pid_t pid)
{
    MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
    return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
}
boolean_t
memorystatus_is_foreground_locked(proc_t p)
{
    return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
           (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
}
/*
 * This is meant for stackshot and kperf -- it does not take the proc_list_lock
 * to access the p_memstat_dirty field.
 */
void
memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
{
    if (!v) {
        *is_dirty = FALSE;
        *is_dirty_tracked = FALSE;
        *allow_idle_exit = FALSE;
    } else {
        proc_t p = (proc_t)v;
        *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
        *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
        *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
    }
}
boolean_t
memorystatus_bg_pressure_eligible(proc_t p)
{
    boolean_t eligible = FALSE;

    proc_list_lock();

    MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);

    /* Foreground processes have already been dealt with at this point, so just test for eligibility */
    if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
        eligible = TRUE;
    }

    if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
        /*
         * IDLE and IDLE_DEFERRED bands contain processes
         * that have dropped memory to be under their inactive
         * memory limits. And so they can't really give back
         * anything.
         */
        eligible = FALSE;
    }

    proc_list_unlock();

    return eligible;
}
void
memorystatus_send_low_swap_note(void)
{
    struct knote *kn = NULL;

    memorystatus_klist_lock();
    SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
        /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
         * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
         * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
         * kMemorystatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
        if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
            KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
            break;
        }
    }

    memorystatus_klist_unlock();
}
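/*
 * Note: the helper is used here purely as a registration check, as the comment
 * above describes -- the task argument is NULL and both pressure levels are 0,
 * so the call effectively just tests whether the knote's sfflags include
 * NOTE_MEMORYSTATUS_LOW_SWAP; one registered knote is enough to broadcast the
 * kMemorystatusLowSwap hint to the whole list.
 */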
#endif /* CONFIG_MEMORYSTATUS */
/*
 * knote_pressure_level - to check if the knote is registered for this notification level.
 *
 * task - task whose bits we'll be modifying
 *
 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
 *
 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
 */
static boolean_t
is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
{
    if (kn_max->kn_sfflags & knote_pressure_level) {
        if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
            task_clear_has_been_notified(task, pressure_level_to_clear);
        }

        task_mark_has_been_notified(task, pressure_level_to_set);
        return TRUE;
    }

    return FALSE;
}
static void
memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
{
    struct knote *kn = NULL;

    memorystatus_klist_lock();
    SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
        proc_t p = PROC_NULL;
        struct task *t = TASK_NULL;

        p = knote_get_kq(kn)->kq_p;
        proc_list_lock();
        if (p != proc_ref_locked(p)) {
            p = PROC_NULL;
            proc_list_unlock();
            continue;
        }
        proc_list_unlock();

        t = (struct task *)(p->task);

        task_clear_has_been_notified(t, pressure_level_to_clear);

        proc_rele(p);
    }

    memorystatus_klist_unlock();
}
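/*
 * Note: clearing the per-task "has been notified" bit for a level re-arms
 * notifications for that level. memorystatus_update_vm_pressure() calls this
 * once the warning/critical resting window expires, so tasks that were already
 * notified can be notified again if the pressure persists or recurs.
 */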
/*
 * Used by the vm_pressure_thread which is
 * signalled from within vm_pageout_scan().
 */
void
consider_vm_pressure_events(void)
{
    vm_dispatch_memory_pressure();
}

static void
vm_dispatch_memory_pressure(void)
{
    memorystatus_update_vm_pressure(FALSE);
}
static struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process)
{
    struct knote *kn = NULL, *kn_max = NULL;
    uint64_t resident_max = 0; /* MB */
    int selected_task_importance = 0;
    static int pressure_snapshot = -1;
    boolean_t pressure_increase = FALSE;

    if (pressure_snapshot == -1) {
        /*
         * Initial snapshot.
         */
        pressure_snapshot = level;
        pressure_increase = TRUE;
    } else {
        if (level && (level >= pressure_snapshot)) {
            pressure_increase = TRUE;
        } else {
            pressure_increase = FALSE;
        }

        pressure_snapshot = level;
    }

    if (pressure_increase == TRUE) {
        /*
         * We'll start by considering the largest
         * unimportant task in our list.
         */
        selected_task_importance = INT_MAX;
    } else {
        /*
         * We'll start by considering the largest
         * important task in our list.
         */
        selected_task_importance = 0;
    }

    SLIST_FOREACH(kn, candidate_list, kn_selnext) {
        uint64_t resident_size = 0; /* MB */
        proc_t p = PROC_NULL;
        struct task *t = TASK_NULL;
        int curr_task_importance = 0;
        boolean_t consider_knote = FALSE;
        boolean_t privileged_listener = FALSE;

        p = knote_get_kq(kn)->kq_p;
        proc_list_lock();
        if (p != proc_ref_locked(p)) {
            p = PROC_NULL;
            proc_list_unlock();
            continue;
        }
        proc_list_unlock();

#if CONFIG_MEMORYSTATUS
        if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
            /*
             * Skip process not marked foreground.
             */
            proc_rele(p);
            continue;
        }
#endif /* CONFIG_MEMORYSTATUS */

        t = (struct task *)(p->task);

        vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);

        if ((kn->kn_sfflags & dispatch_level) == 0) {
            proc_rele(p);
            continue;
        }

#if CONFIG_MEMORYSTATUS
        if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
            VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid);
            proc_rele(p);
            continue;
        }
#endif /* CONFIG_MEMORYSTATUS */

#if XNU_TARGET_OS_OSX
        curr_task_importance = task_importance_estimate(t);
#else /* XNU_TARGET_OS_OSX */
        curr_task_importance = p->p_memstat_effectivepriority;
#endif /* XNU_TARGET_OS_OSX */

        /*
         * Privileged listeners are only considered in the multi-level pressure scheme
         * AND only if the pressure is increasing.
         */
        if (level > 0) {
            if (task_has_been_notified(t, level) == FALSE) {
                /*
                 * Is this a privileged listener?
                 */
                if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
                    if (privileged_listener) {
                        kn_max = kn;
                        proc_rele(p);
                        goto done_scanning;
                    }
                }
            } else {
                proc_rele(p);
                continue;
            }
        } else if (level == 0) {
            /*
             * Task wasn't notified when the pressure was increasing and so
             * no need to notify it that the pressure is decreasing.
             */
            if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
                proc_rele(p);
                continue;
            }
        }

        /*
         * We don't want a small process to block large processes from
         * being notified again. <rdar://problem/7955532>
         */
        resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */

        if (resident_size >= vm_pressure_task_footprint_min) {
            if (level > 0) {
                /*
                 * Warning or Critical Pressure.
                 */
                if (pressure_increase) {
                    if ((curr_task_importance < selected_task_importance) ||
                        ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
                        /*
                         * We have found a candidate process which is:
                         * a) at a lower importance than the current selected process
                         * OR
                         * b) has importance equal to that of the current selected process but is larger
                         */

                        consider_knote = TRUE;
                    }
                } else {
                    if ((curr_task_importance > selected_task_importance) ||
                        ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
                        /*
                         * We have found a candidate process which is:
                         * a) at a higher importance than the current selected process
                         * OR
                         * b) has importance equal to that of the current selected process but is larger
                         */

                        consider_knote = TRUE;
                    }
                }
            } else if (level == 0) {
                /*
                 * Pressure back to normal.
                 */
                if ((curr_task_importance > selected_task_importance) ||
                    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
                    consider_knote = TRUE;
                }
            }

            if (consider_knote) {
                resident_max = resident_size;
                kn_max = kn;
                selected_task_importance = curr_task_importance;
                consider_knote = FALSE; /* reset for the next candidate */
            }
        } else {
            /* There was no candidate with enough resident memory to scavenge */
            VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", p->p_pid, resident_size);
        }
        proc_rele(p);
    }

done_scanning:
    if (kn_max) {
        VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, knote_get_kq(kn_max)->kq_p->p_pid, resident_max, 0, 0);
        VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", knote_get_kq(kn_max)->kq_p->p_pid, resident_max);
    }

    return kn_max;
}
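/*
 * Selection policy recap (from the logic above): a privileged low-memory
 * listener that has not yet been notified for this level is returned
 * immediately. Otherwise, among knotes registered for the dispatched level and
 * with a footprint of at least vm_pressure_task_footprint_min MB, rising
 * pressure prefers the least important task (ties broken by larger resident
 * size), while a return to normal prefers the most important task, so the
 * biggest, least important consumers are asked to give memory back first.
 */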
static uint64_t next_warning_notification_sent_at_ts = 0;
static uint64_t next_critical_notification_sent_at_ts = 0;

boolean_t memorystatus_manual_testing_on = FALSE;
vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;
kern_return_t
memorystatus_update_vm_pressure(boolean_t target_foreground_process)
{
    struct knote *kn_max = NULL;
    struct knote *kn_cur = NULL, *kn_temp = NULL; /* for safe list traversal */
    pid_t target_pid = -1;
    struct klist dispatch_klist = { NULL };
    proc_t target_proc = PROC_NULL;
    struct task *task = NULL;
    boolean_t found_candidate = FALSE;

    static vm_pressure_level_t level_snapshot = kVMPressureNormal;
    static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
    boolean_t smoothing_window_started = FALSE;
    struct timeval smoothing_window_start_tstamp = {0, 0};
    struct timeval curr_tstamp = {0, 0};
    int64_t elapsed_msecs = 0;
    uint64_t curr_ts = mach_absolute_time();

#if !CONFIG_JETSAM
#define MAX_IDLE_KILLS 100      /* limit the number of idle kills allowed */

    int idle_kill_counter = 0;

    /*
     * On desktop we take this opportunity to free up memory pressure
     * by immediately killing idle exitable processes. We use a delay
     * to avoid overkill. And we impose a max counter as a fail safe
     * in case daemons re-launch too fast.
     */
    while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
        if (memorystatus_idle_exit_from_VM() == FALSE) {
            /* No idle exitable processes left to kill */
            break;
        }
        idle_kill_counter++;

        if (memorystatus_manual_testing_on == TRUE) {
            /*
             * Skip the delay when testing
             * the pressure notification scheme.
             */
        } else {
            delay(1000000); /* 1 second */
        }
    }
#endif /* !CONFIG_JETSAM */

    if (level_snapshot != kVMPressureNormal) {
        /*
         * Check to see if we are still in the 'resting' period
         * after having notified all clients interested in
         * a particular pressure level.
         */

        level_snapshot = memorystatus_vm_pressure_level;

        if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
            if (next_warning_notification_sent_at_ts) {
                if (curr_ts < next_warning_notification_sent_at_ts) {
                    delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
                    return KERN_SUCCESS;
                }

                next_warning_notification_sent_at_ts = 0;
                memorystatus_klist_reset_all_for_level(kVMPressureWarning);
            }
        } else if (level_snapshot == kVMPressureCritical) {
            if (next_critical_notification_sent_at_ts) {
                if (curr_ts < next_critical_notification_sent_at_ts) {
                    delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
                    return KERN_SUCCESS;
                }

                next_critical_notification_sent_at_ts = 0;
                memorystatus_klist_reset_all_for_level(kVMPressureCritical);
            }
        }
    }

    while (1) {
        /*
         * There is a race window here. But it's not clear
         * how much we benefit from having extra synchronization.
         */
        level_snapshot = memorystatus_vm_pressure_level;

        if (prev_level_snapshot > level_snapshot) {
            /*
             * Pressure decreased? Let's take a little breather
             * and see if this condition stays.
             */
            if (smoothing_window_started == FALSE) {
                smoothing_window_started = TRUE;
                microuptime(&smoothing_window_start_tstamp);
            }

            microuptime(&curr_tstamp);
            timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
            elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;

            if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
                delay(INTER_NOTIFICATION_DELAY);
                continue;
            }
        }

        prev_level_snapshot = level_snapshot;
        smoothing_window_started = FALSE;

        memorystatus_klist_lock();
        kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);

        if (kn_max == NULL) {
            memorystatus_klist_unlock();

            /*
             * No more level-based clients to notify.
             *
             * Start the 'resting' window within which clients will not be re-notified.
             */

            if (level_snapshot != kVMPressureNormal) {
                if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
                    nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);

                    /* Next warning notification (if nothing changes) won't be sent before...*/
                    next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
                }

                if (level_snapshot == kVMPressureCritical) {
                    nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);

                    /* Next critical notification (if nothing changes) won't be sent before...*/
                    next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
                }
            }
            return KERN_FAILURE;
        }

        target_proc = knote_get_kq(kn_max)->kq_p;

        proc_list_lock();
        if (target_proc != proc_ref_locked(target_proc)) {
            target_proc = PROC_NULL;
            proc_list_unlock();
            memorystatus_klist_unlock();
            continue;
        }
        proc_list_unlock();

        target_pid = target_proc->p_pid;

        task = (struct task *)(target_proc->task);

        if (level_snapshot != kVMPressureNormal) {
            if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
                if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
                    found_candidate = TRUE;
                }
            } else {
                if (level_snapshot == kVMPressureCritical) {
                    if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
                        found_candidate = TRUE;
                    }
                }
            }
        } else {
            if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
                task_clear_has_been_notified(task, kVMPressureWarning);
                task_clear_has_been_notified(task, kVMPressureCritical);

                found_candidate = TRUE;
            }
        }

        if (found_candidate == FALSE) {
            proc_rele(target_proc);
            memorystatus_klist_unlock();
            continue;
        }

        SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
            int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);

            if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
                proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
                pid_t knote_pid = knote_proc->p_pid;
                if (knote_pid == target_pid) {
                    KNOTE_DETACH(&memorystatus_klist, kn_cur);
                    KNOTE_ATTACH(&dispatch_klist, kn_cur);
                }
            }
        }

        KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);

        SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
            KNOTE_DETACH(&dispatch_klist, kn_cur);
            KNOTE_ATTACH(&memorystatus_klist, kn_cur);
        }

        memorystatus_klist_unlock();

        microuptime(&target_proc->vm_pressure_last_notify_tstamp);
        proc_rele(target_proc);

        if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
            break;
        }

        if (memorystatus_manual_testing_on == TRUE) {
            /*
             * Testing out the pressure notification scheme.
             * No need for delays etc.
             */
        } else {
            uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
#if CONFIG_JETSAM
            unsigned int page_delta = 0;
            unsigned int skip_delay_page_threshold = 0;

            assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);

            page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
            skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;

            if (memorystatus_available_pages <= skip_delay_page_threshold) {
                /*
                 * We are nearing the critical mark fast and can't afford to wait between
                 * notifications.
                 */
                sleep_interval = 0;
            }
#endif /* CONFIG_JETSAM */

            if (sleep_interval) {
                delay(sleep_interval);
            }
        }
    }

    return KERN_SUCCESS;
}
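/*
 * Timing recap for the loop above: after every interested client for a
 * warning/critical level has been notified, the next notification for that
 * level is deferred by WARNING_NOTIFICATION_RESTING_PERIOD or
 * CRITICAL_NOTIFICATION_RESTING_PERIOD (25 s each, converted to absolute time
 * units), and a drop in pressure is only believed after it has persisted for
 * VM_PRESSURE_DECREASED_SMOOTHING_PERIOD (5000 ms) of the smoothing window.
 */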
static vm_pressure_level_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
{
    uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;

    switch (internal_pressure_level) {
    case kVMPressureNormal:
        dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
        break;

    case kVMPressureWarning:
    case kVMPressureUrgent:
        dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
        break;

    case kVMPressureCritical:
        dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
        break;

    default:
        break;
    }

    return dispatch_level;
}
/*
 * Notify any kexts that are waiting for notification that jetsam
 * is approaching the foreground bands. They should use this notification
 * to free cached memory.
 */
void
memorystatus_issue_fg_band_notify(void)
{
    uint64_t now;

    lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
    absolutetime_to_nanoseconds(mach_absolute_time(), &now);
    if (now - memorystatus_jetsam_fg_band_timestamp_ns < memorystatus_jetsam_fg_band_delay_ns) {
        lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
        return;
    }

    if (memorystatus_jetsam_fg_band_waiters > 0) {
        thread_wakeup(&memorystatus_jetsam_fg_band_waiters);
        memorystatus_jetsam_fg_band_waiters = 0;
        memorystatus_jetsam_fg_band_timestamp_ns = now;
    }
    lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);

    /* Notify the buffer cache, file systems, etc. to jetison everything they can. */
    if (consider_buffer_cache_collect != NULL) {
        (void)(*consider_buffer_cache_collect)(1);
    }
}
/*
 * Memorystatus notification debugging support
 */

static int
sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
#if !XNU_TARGET_OS_OSX
    int error = 0;

    error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
    if (error) {
        return error;
    }
#endif /* !XNU_TARGET_OS_OSX */

    uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);

    return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
}
#if DEBUG || DEVELOPMENT

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#else /* DEBUG || DEVELOPMENT */

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#endif /* DEBUG || DEVELOPMENT */
/*
 * Trigger levels to test the mechanism.
 * Can be used via a sysctl.
 */
#define TEST_LOW_MEMORY_TRIGGER_ONE             1
#define TEST_LOW_MEMORY_TRIGGER_ALL             2
#define TEST_PURGEABLE_TRIGGER_ONE              3
#define TEST_PURGEABLE_TRIGGER_ALL              4
#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE   5
#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL   6
static int
sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

    int level = 0;
    int error = 0;
    int force_purge;
    int pressure_level = 0;
    int trigger_request = 0;

    error = sysctl_handle_int(oidp, &level, 0, req);
    if (error || !req->newptr) {
        return error;
    }

    memorystatus_manual_testing_on = TRUE;

    trigger_request = (level >> 16) & 0xFFFF;
    pressure_level = (level & 0xFFFF);

    if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
        trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
        return EINVAL;
    }
    switch (pressure_level) {
    case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
    case NOTE_MEMORYSTATUS_PRESSURE_WARN:
    case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
        break;
    default:
        return EINVAL;
    }

    /*
     * The pressure level is being set from user-space.
     * And user-space uses the constants in sys/event.h
     * So we translate those events to our internal levels here.
     */
    if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
        memorystatus_manual_testing_level = kVMPressureNormal;
        force_purge = 0;
    } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
        memorystatus_manual_testing_level = kVMPressureWarning;
        force_purge = vm_pageout_state.memorystatus_purge_on_warning;
    } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
        memorystatus_manual_testing_level = kVMPressureCritical;
        force_purge = vm_pageout_state.memorystatus_purge_on_critical;
    }

    memorystatus_vm_pressure_level = memorystatus_manual_testing_level;

    /* purge according to the new pressure level */
    switch (trigger_request) {
    case TEST_PURGEABLE_TRIGGER_ONE:
    case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
        if (force_purge == 0) {
            /* no purging requested */
            break;
        }
        vm_purgeable_object_purge_one_unlocked(force_purge);
        break;

    case TEST_PURGEABLE_TRIGGER_ALL:
    case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
        if (force_purge == 0) {
            /* no purging requested */
            break;
        }
        while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
            ;
        }
        break;
    }

    if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
        (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
        memorystatus_update_vm_pressure(TRUE);
    }

    if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
        (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
        while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
            continue;
        }
    }

    if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
        memorystatus_manual_testing_on = FALSE;
    }

    return 0;
}

SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");

#if DEBUG || DEVELOPMENT
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");
#endif /* DEBUG || DEVELOPMENT */
#if CONFIG_JETSAM && VM_PRESSURE_EVENTS

boolean_t
memorystatus_issue_pressure_kevent(boolean_t pressured)
{
    memorystatus_klist_lock();
    KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
    memorystatus_klist_unlock();
    return TRUE;
}

#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
#if DEBUG || DEVELOPMENT

/*
 * This routine is used for targeted notifications regardless of system memory pressure
 * and regardless of whether or not the process has already been notified.
 * It bypasses and has no effect on the only-one-notification per soft-limit policy.
 *
 * "memnote" is the current user.
 */
static int
sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
    /* Need to be root or have memorystatus entitlement */
    if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
        return EPERM;
    }

    int error = 0, pid = 0;
    struct knote *kn = NULL;
    boolean_t found_knote = FALSE;
    int fflags = 0;         /* filter flags for EVFILT_MEMORYSTATUS */
    uint64_t value = 0;

    error = sysctl_handle_quad(oidp, &value, 0, req);
    if (error || !req->newptr) {
        return error;
    }

    /*
     * Find the pid in the low 32 bits of value passed in.
     */
    pid = (int)(value & 0xFFFFFFFF);

    /*
     * Find notification in the high 32 bits of the value passed in.
     */
    fflags = (int)((value >> 32) & 0xFFFFFFFF);

    /*
     * For backwards compatibility, when no notification is
     * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN
     * notification.
     */
    if (fflags == 0) {
        fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
        // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
    }

    /* wake up everybody waiting for kVMPressureJetsam */
    if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
        memorystatus_issue_fg_band_notify();
        return error;
    }

    /*
     * See event.h ... fflags for EVFILT_MEMORYSTATUS
     */
    if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
        (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
        (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
        (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
        (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
        (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
        (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
        ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
        printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags);
        error = 1;
        return error;
    }

    /*
     * Forcibly send pid a memorystatus notification.
     */

    memorystatus_klist_lock();

    SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
        proc_t knote_proc = knote_get_kq(kn)->kq_p;
        pid_t knote_pid = knote_proc->p_pid;

        if (knote_pid == pid) {
            /*
             * Forcibly send this pid a memorystatus notification.
             */
            kn->kn_fflags = fflags;
            found_knote = TRUE;
        }
    }

    if (found_knote) {
        KNOTE(&memorystatus_klist, 0);
        printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid);
        error = 0;
    } else {
        printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
        error = 1;
    }

    memorystatus_klist_unlock();

    return error;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
    0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
#endif /* DEBUG || DEVELOPMENT */

#endif /* VM_PRESSURE_EVENTS */