1 /*
2 * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30 #include <sys/kern_event.h>
31 #include <kern/sched_prim.h>
32 #include <kern/kalloc.h>
33 #include <kern/assert.h>
34 #include <kern/debug.h>
35 #include <kern/locks.h>
36 #include <kern/task.h>
37 #include <kern/thread.h>
38 #include <kern/host.h>
39 #include <kern/policy_internal.h>
40 #include <kern/thread_group.h>
41
42 #include <IOKit/IOBSD.h>
43
44 #include <libkern/libkern.h>
45 #include <mach/coalition.h>
46 #include <mach/mach_time.h>
47 #include <mach/task.h>
48 #include <mach/host_priv.h>
49 #include <mach/mach_host.h>
50 #include <os/log.h>
51 #include <pexpert/pexpert.h>
52 #include <sys/coalition.h>
53 #include <sys/kern_event.h>
54 #include <sys/proc.h>
55 #include <sys/proc_info.h>
56 #include <sys/reason.h>
57 #include <sys/signal.h>
58 #include <sys/signalvar.h>
59 #include <sys/sysctl.h>
60 #include <sys/sysproto.h>
61 #include <sys/time.h>
62 #include <sys/wait.h>
63 #include <sys/tree.h>
64 #include <sys/priv.h>
65 #include <vm/vm_pageout.h>
66 #include <vm/vm_protos.h>
67 #include <mach/machine/sdt.h>
68 #include <libkern/section_keywords.h>
69 #include <stdatomic.h>
70
71 #if CONFIG_FREEZE
72 #include <vm/vm_map.h>
73 #endif /* CONFIG_FREEZE */
74
75 #include <sys/kern_memorystatus.h>
76 #include <sys/kern_memorystatus_notify.h>
77
78 /*
79 * Memorystatus klist structures
80 */
81 struct klist memorystatus_klist;
82 static lck_mtx_t memorystatus_klist_mutex;
83 static void memorystatus_klist_lock(void);
84 static void memorystatus_klist_unlock(void);
85
86 /*
87 * Memorystatus kevent filter routines
88 */
89 static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
90 static void filt_memorystatusdetach(struct knote *kn);
91 static int filt_memorystatus(struct knote *kn, long hint);
92 static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
93 static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);
94
95 SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
96 .f_attach = filt_memorystatusattach,
97 .f_detach = filt_memorystatusdetach,
98 .f_event = filt_memorystatus,
99 .f_touch = filt_memorystatustouch,
100 .f_process = filt_memorystatusprocess,
101 };
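/*
 * Editor's illustration (not compiled as part of this file): a userspace
 * client typically reaches these filter ops through kqueue(2)/kevent(2) by
 * registering an EVFILT_MEMORYSTATUS knote; the first kevent() call below
 * lands in filt_memorystatusattach() and the second blocks until
 * filt_memorystatus()/filt_memorystatusprocess() deliver a level in fflags.
 * The ident of 0 and the particular flag set are assumptions for the sketch.
 *
 *	int kq = kqueue();
 *	struct kevent ev;
 *	EV_SET(&ev, 0, EVFILT_MEMORYSTATUS, EV_ADD | EV_ENABLE,
 *	    NOTE_MEMORYSTATUS_PRESSURE_NORMAL |
 *	    NOTE_MEMORYSTATUS_PRESSURE_WARN |
 *	    NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 */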
102
103 /*
104 * Memorystatus notification events
105 */
106 enum {
107 kMemorystatusNoPressure = 0x1,
108 kMemorystatusPressure = 0x2,
109 kMemorystatusLowSwap = 0x4,
110 kMemorystatusProcLimitWarn = 0x8,
111 kMemorystatusProcLimitCritical = 0x10
112 };
113
114 #define INTER_NOTIFICATION_DELAY (250000) /* .25 second */
115 #define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */
116 #define WARNING_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
117 #define CRITICAL_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
118
119 /*
120 * Memorystatus notification helper routines
121 */
122 static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
123 static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
124 static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
125 static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process);
126 static void vm_dispatch_memory_pressure(void);
127 kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
128
129 #if VM_PRESSURE_EVENTS
130
131 /*
132 * This value is the threshold that a process must meet to be considered for scavenging.
133 */
134 #if CONFIG_EMBEDDED
135 #define VM_PRESSURE_MINIMUM_RSIZE 6 /* MB */
136 #else /* CONFIG_EMBEDDED */
137 #define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */
138 #endif /* CONFIG_EMBEDDED */
139
140 static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;
141
142 #if DEVELOPMENT || DEBUG
143 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
144 #endif /* DEVELOPMENT || DEBUG */
145
146 vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;
147
148 /*
149 * We use this flag to signal if we have any HWM offenders
150 * on the system. This way we can reduce the number of wakeups
151 * of the memorystatus_thread when the system is between the
152 * "pressure" and "critical" threshold.
153 *
154 * The (re-)setting of this variable is done without any locks
155 * or synchronization simply because it is not possible (currently)
156 * to keep track of HWM offenders that drop down below their memory
157 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
158 * by allowing the unguarded modification of this variable.
159 */
160 boolean_t memorystatus_hwm_candidates = 0;
161
162 #endif /* VM_PRESSURE_EVENTS */
163
164 #if CONFIG_JETSAM
165
166 extern unsigned int memorystatus_available_pages;
167 extern unsigned int memorystatus_available_pages_pressure;
168 extern unsigned int memorystatus_available_pages_critical;
169 extern unsigned int memorystatus_available_pages_critical_base;
170 extern unsigned int memorystatus_available_pages_critical_idle_offset;
171
172 #else /* CONFIG_JETSAM */
173
174 extern uint64_t memorystatus_available_pages;
175 extern uint64_t memorystatus_available_pages_pressure;
176 extern uint64_t memorystatus_available_pages_critical;
177
178 #endif /* CONFIG_JETSAM */
179
180 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
181 uint32_t memorystatus_jetsam_fg_band_waiters = 0;
182 static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
183 static uint64_t memorystatus_jetsam_fg_band_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */
184
185 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
186
187 #if DEVELOPMENT || DEBUG
188 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_fg_band_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
189 &memorystatus_jetsam_fg_band_delay_ns, "");
190 #endif
191
192 static int
193 filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
194 {
195 int error;
196
197 kn->kn_flags |= EV_CLEAR; /* automatically set */
198 kn->kn_sdata = 0; /* incoming data is ignored */
199
200 error = memorystatus_knote_register(kn);
201 if (error) {
202 knote_set_error(kn, error);
203 }
204 return 0;
205 }
206
207 static void
208 filt_memorystatusdetach(struct knote *kn)
209 {
210 memorystatus_knote_unregister(kn);
211 }
212
213 static int
214 filt_memorystatus(struct knote *kn, long hint)
215 {
216 if (hint) {
217 switch (hint) {
218 case kMemorystatusNoPressure:
219 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
220 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
221 }
222 break;
223 case kMemorystatusPressure:
224 if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
225 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
226 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
227 }
228 } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
229 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
230 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
231 }
232 }
233 break;
234 case kMemorystatusLowSwap:
235 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
236 kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
237 }
238 break;
239
240 case kMemorystatusProcLimitWarn:
241 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
242 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
243 }
244 break;
245
246 case kMemorystatusProcLimitCritical:
247 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
248 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
249 }
250 break;
251
252 default:
253 break;
254 }
255 }
256
257 #if 0
258 if (kn->kn_fflags != 0) {
259 proc_t knote_proc = knote_get_kq(kn)->kq_p;
260 pid_t knote_pid = knote_proc->p_pid;
261
262 printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
263 (unsigned long)kn, kn->kn_fflags, knote_pid);
264 }
265 #endif
266
267 return kn->kn_fflags != 0;
268 }
269
270 static int
271 filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
272 {
273 int res;
274 int prev_kn_sfflags = 0;
275
276 memorystatus_klist_lock();
277
278 /*
279 * copy in new kevent settings
280 * (saving the "desired" data and fflags).
281 */
282
283 prev_kn_sfflags = kn->kn_sfflags;
284 kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
285
286 #if !CONFIG_EMBEDDED
287 /*
288 * Only on desktop do we restrict notifications to
289 * one per active/inactive state (soft limits only).
290 */
291 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
292 /*
293 * Is there previous state to preserve?
294 */
295 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
296 /*
297 * This knote was previously interested in proc_limit_warn,
298 * so yes, preserve previous state.
299 */
300 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
301 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
302 }
303 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
304 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
305 }
306 } else {
307 /*
308 * This knote was not previously interested in proc_limit_warn,
309 * but it is now. Set both states.
310 */
311 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
312 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
313 }
314 }
315
316 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
317 /*
318 * Is there previous state to preserve?
319 */
320 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
321 /*
322 * This knote was previously interested in proc_limit_critical,
323 * so yes, preserve previous state.
324 */
325 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
326 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
327 }
328 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
329 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
330 }
331 } else {
332 /*
333 * This knote was not previously interested in proc_limit_critical,
334 * but it is now. Set both states.
335 */
336 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
337 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
338 }
339 }
340 #endif /* !CONFIG_EMBEDDED */
341
342 /*
343 * reset the output flags based on a
344 * combination of the old events and
345 * the new desired event list.
346 */
347 //kn->kn_fflags &= kn->kn_sfflags;
348
349 res = (kn->kn_fflags != 0);
350
351 memorystatus_klist_unlock();
352
353 return res;
354 }
355
356 static int
357 filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
358 {
359 int res = 0;
360
361 memorystatus_klist_lock();
362 if (kn->kn_fflags) {
363 knote_fill_kevent(kn, kev, 0);
364 res = 1;
365 }
366 memorystatus_klist_unlock();
367
368 return res;
369 }
370
371 static void
372 memorystatus_klist_lock(void)
373 {
374 lck_mtx_lock(&memorystatus_klist_mutex);
375 }
376
377 static void
378 memorystatus_klist_unlock(void)
379 {
380 lck_mtx_unlock(&memorystatus_klist_mutex);
381 }
382
383 void
384 memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
385 {
386 lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
387 klist_init(&memorystatus_klist);
388 }
389
390 int
391 memorystatus_knote_register(struct knote *kn)
392 {
393 int error = 0;
394
395 memorystatus_klist_lock();
396
397 /*
398 * Support only userspace visible flags.
399 */
400 if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
401 #if !CONFIG_EMBEDDED
402 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
403 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
404 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
405 }
406
407 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
408 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
409 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
410 }
411 #endif /* !CONFIG_EMBEDDED */
412
413 KNOTE_ATTACH(&memorystatus_klist, kn);
414 } else {
415 error = ENOTSUP;
416 }
417
418 memorystatus_klist_unlock();
419
420 return error;
421 }
422
423 void
424 memorystatus_knote_unregister(struct knote *kn)
425 {
426 memorystatus_klist_lock();
427 KNOTE_DETACH(&memorystatus_klist, kn);
428 memorystatus_klist_unlock();
429 }
430
431 #if VM_PRESSURE_EVENTS
432
433 #if CONFIG_MEMORYSTATUS
434
435 int
436 memorystatus_send_note(int event_code, void *data, size_t data_length)
437 {
438 int ret;
439 struct kev_msg ev_msg;
440
441 ev_msg.vendor_code = KEV_VENDOR_APPLE;
442 ev_msg.kev_class = KEV_SYSTEM_CLASS;
443 ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;
444
445 ev_msg.event_code = event_code;
446
447 ev_msg.dv[0].data_length = data_length;
448 ev_msg.dv[0].data_ptr = data;
449 ev_msg.dv[1].data_length = 0;
450
451 ret = kev_post_msg(&ev_msg);
452 if (ret) {
453 printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
454 }
455
456 return ret;
457 }
458
459 boolean_t
460 memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
461 {
462 boolean_t ret = FALSE;
463 boolean_t found_knote = FALSE;
464 struct knote *kn = NULL;
465 int send_knote_count = 0;
466
467 /*
468 * See comment in sysctl_memorystatus_vm_pressure_send.
469 */
470
471 memorystatus_klist_lock();
472
473 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
474 proc_t knote_proc = knote_get_kq(kn)->kq_p;
475 pid_t knote_pid = knote_proc->p_pid;
476
477 if (knote_pid == pid) {
478 /*
479 * By setting the "fflags" here, we are forcing
480 * a process to deal with the case where it's
481 * bumping up into its memory limits. If we don't
482 * do this here, we will end up depending on the
483 * system pressure snapshot evaluation in
484 * filt_memorystatus().
485 */
486
487 #if CONFIG_EMBEDDED
488 if (!limit_exceeded) {
489 /*
490 * Intentionally set either the unambiguous limit warning,
491 * the system-wide critical or the system-wide warning
492 * notification bit.
493 */
494
495 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
496 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
497 found_knote = TRUE;
498 send_knote_count++;
499 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
500 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
501 found_knote = TRUE;
502 send_knote_count++;
503 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
504 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
505 found_knote = TRUE;
506 send_knote_count++;
507 }
508 } else {
509 /*
510 * Send this notification when a process has exceeded a soft limit.
511 */
512 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
513 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
514 found_knote = TRUE;
515 send_knote_count++;
516 }
517 }
518 #else /* CONFIG_EMBEDDED */
519 if (!limit_exceeded) {
520 /*
521 * Processes on desktop are not expecting to handle a system-wide
522 * critical or system-wide warning notification from this path.
523 * Intentionally set only the unambiguous limit warning here.
524 *
525 * If the limit is soft, however, limit this to one notification per
526 * active/inactive limit (per each registered listener).
527 */
528
529 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
530 found_knote = TRUE;
531 if (!is_fatal) {
532 /*
533 * Restrict proc_limit_warn notifications when
534 * non-fatal (soft) limit is at play.
535 */
536 if (is_active) {
537 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
538 /*
539 * Mark this knote for delivery.
540 */
541 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
542 /*
543 * And suppress it from future notifications.
544 */
545 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
546 send_knote_count++;
547 }
548 } else {
549 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
550 /*
551 * Mark this knote for delivery.
552 */
553 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
554 /*
555 * And suppress it from future notifications.
556 */
557 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
558 send_knote_count++;
559 }
560 }
561 } else {
562 /*
563 * No restriction on proc_limit_warn notifications when
564 * fatal (hard) limit is at play.
565 */
566 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
567 send_knote_count++;
568 }
569 }
570 } else {
571 /*
572 * Send this notification when a process has exceeded a soft limit,
573 */
574
575 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
576 found_knote = TRUE;
577 if (!is_fatal) {
578 /*
579 * Restrict critical notifications for soft limits.
580 */
581
582 if (is_active) {
583 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
584 /*
585 * Suppress future proc_limit_critical notifications
586 * for the active soft limit.
587 */
588 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
589 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
590 send_knote_count++;
591 }
592 } else {
593 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
594 /*
595 * Suppress future proc_limit_critical notifications
596 * for the inactive soft limit.
597 */
598 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
599 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
600 send_knote_count++;
601 }
602 }
603 } else {
604 /*
605 * We should never be trying to send a critical notification for
606 * a hard limit... the process would be killed before it could be
607 * received.
608 */
609 panic("Caught sending pid %d a critical warning for a fatal limit.\n", pid);
610 }
611 }
612 }
613 #endif /* CONFIG_EMBEDDED */
614 }
615 }
616
617 if (found_knote) {
618 if (send_knote_count > 0) {
619 KNOTE(&memorystatus_klist, 0);
620 }
621 ret = TRUE;
622 }
623
624 memorystatus_klist_unlock();
625
626 return ret;
627 }
628
629 /*
630 * Can only be set by the current task on itself.
631 */
632 int
633 memorystatus_low_mem_privileged_listener(uint32_t op_flags)
634 {
635 boolean_t set_privilege = FALSE;
636 /*
637 * Need an entitlement check here?
638 */
639 if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
640 set_privilege = TRUE;
641 } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
642 set_privilege = FALSE;
643 } else {
644 return EINVAL;
645 }
646
647 return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
648 }
649
650 int
651 memorystatus_send_pressure_note(pid_t pid)
652 {
653 MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
654 return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
655 }
656
657 boolean_t
658 memorystatus_is_foreground_locked(proc_t p)
659 {
660 return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
661 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
662 }
663
664 /*
665 * This is meant for stackshot and kperf -- it does not take the proc_list_lock
666 * to access the p_memstat_dirty field.
667 */
668 void
669 memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
670 {
671 if (!v) {
672 *is_dirty = FALSE;
673 *is_dirty_tracked = FALSE;
674 *allow_idle_exit = FALSE;
675 } else {
676 proc_t p = (proc_t)v;
677 *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
678 *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
679 *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
680 }
681 }
682
683 boolean_t
684 memorystatus_bg_pressure_eligible(proc_t p)
685 {
686 boolean_t eligible = FALSE;
687
688 proc_list_lock();
689
690 MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);
691
692 /* Foreground processes have already been dealt with at this point, so just test for eligibility */
693 if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
694 eligible = TRUE;
695 }
696
697 if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
698 /*
699 * IDLE and IDLE_DEFERRED bands contain processes
700 * that have dropped memory to be under their inactive
701 * memory limits. And so they can't really give back
702 * anything.
703 */
704 eligible = FALSE;
705 }
706
707 proc_list_unlock();
708
709 return eligible;
710 }
711
712 void
713 memorystatus_send_low_swap_note(void)
714 {
715 struct knote *kn = NULL;
716
717 memorystatus_klist_lock();
718 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
719 /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
720 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
721 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
722 * kMemorystatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
723 if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
724 KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
725 break;
726 }
727 }
728
729 memorystatus_klist_unlock();
730 }
731
732 #endif /* CONFIG_MEMORYSTATUS */
733
734 /*
735 * kn_max - knote
736 *
737 * knote_pressure_level - to check if the knote is registered for this notification level.
738 *
739 * task - task whose bits we'll be modifying
740 *
741 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
742 *
743 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
744 *
745 */
746
747 static boolean_t
748 is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
749 {
750 if (kn_max->kn_sfflags & knote_pressure_level) {
751 if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
752 task_clear_has_been_notified(task, pressure_level_to_clear);
753 }
754
755 task_mark_has_been_notified(task, pressure_level_to_set);
756 return TRUE;
757 }
758
759 return FALSE;
760 }
761
762 static void
763 memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
764 {
765 struct knote *kn = NULL;
766
767 memorystatus_klist_lock();
768 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
769 proc_t p = PROC_NULL;
770 struct task* t = TASK_NULL;
771
772 p = knote_get_kq(kn)->kq_p;
773 proc_list_lock();
774 if (p != proc_ref_locked(p)) {
775 p = PROC_NULL;
776 proc_list_unlock();
777 continue;
778 }
779 proc_list_unlock();
780
781 t = (struct task *)(p->task);
782
783 task_clear_has_been_notified(t, pressure_level_to_clear);
784
785 proc_rele(p);
786 }
787
788 memorystatus_klist_unlock();
789 }
790
791 /*
792 * Used by the vm_pressure_thread which is
793 * signalled from within vm_pageout_scan().
794 */
795
796 void
797 consider_vm_pressure_events(void)
798 {
799 vm_dispatch_memory_pressure();
800 }
801
802 static void
803 vm_dispatch_memory_pressure(void)
804 {
805 memorystatus_update_vm_pressure(FALSE);
806 }
807
808 static struct knote *
809 vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process)
810 {
811 struct knote *kn = NULL, *kn_max = NULL;
812 uint64_t resident_max = 0;/* MB */
813 struct timeval curr_tstamp = {0, 0};
814 int elapsed_msecs = 0;
815 int selected_task_importance = 0;
816 static int pressure_snapshot = -1;
817 boolean_t pressure_increase = FALSE;
818
819 if (pressure_snapshot == -1) {
820 /*
821 * Initial snapshot.
822 */
823 pressure_snapshot = level;
824 pressure_increase = TRUE;
825 } else {
826 if (level && (level >= pressure_snapshot)) {
827 pressure_increase = TRUE;
828 } else {
829 pressure_increase = FALSE;
830 }
831
832 pressure_snapshot = level;
833 }
834
835 if (pressure_increase == TRUE) {
836 /*
837 * We'll start by considering the largest
838 * unimportant task in our list.
839 */
840 selected_task_importance = INT_MAX;
841 } else {
842 /*
843 * We'll start by considering the largest
844 * important task in our list.
845 */
846 selected_task_importance = 0;
847 }
848
849 microuptime(&curr_tstamp);
850
851 SLIST_FOREACH(kn, candidate_list, kn_selnext) {
852 uint64_t resident_size = 0;/* MB */
853 proc_t p = PROC_NULL;
854 struct task* t = TASK_NULL;
855 int curr_task_importance = 0;
856 boolean_t consider_knote = FALSE;
857 boolean_t privileged_listener = FALSE;
858
859 p = knote_get_kq(kn)->kq_p;
860 proc_list_lock();
861 if (p != proc_ref_locked(p)) {
862 p = PROC_NULL;
863 proc_list_unlock();
864 continue;
865 }
866 proc_list_unlock();
867
868 #if CONFIG_MEMORYSTATUS
869 if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
870 /*
871 * Skip process not marked foreground.
872 */
873 proc_rele(p);
874 continue;
875 }
876 #endif /* CONFIG_MEMORYSTATUS */
877
878 t = (struct task *)(p->task);
879
880 timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp);
881 elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
882
883 vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);
884
885 if ((kn->kn_sfflags & dispatch_level) == 0) {
886 proc_rele(p);
887 continue;
888 }
889
890 #if CONFIG_MEMORYSTATUS
891 if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
892 VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid);
893 proc_rele(p);
894 continue;
895 }
896 #endif /* CONFIG_MEMORYSTATUS */
897
898 #if CONFIG_EMBEDDED
899 curr_task_importance = p->p_memstat_effectivepriority;
900 #else /* CONFIG_EMBEDDED */
901 curr_task_importance = task_importance_estimate(t);
902 #endif /* CONFIG_EMBEDDED */
903
904 /*
905 * Privileged listeners are only considered in the multi-level pressure scheme
906 * AND only if the pressure is increasing.
907 */
908 if (level > 0) {
909 if (task_has_been_notified(t, level) == FALSE) {
910 /*
911 * Is this a privileged listener?
912 */
913 if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
914 if (privileged_listener) {
915 kn_max = kn;
916 proc_rele(p);
917 goto done_scanning;
918 }
919 }
920 } else {
921 proc_rele(p);
922 continue;
923 }
924 } else if (level == 0) {
925 /*
926 * Task wasn't notified when the pressure was increasing and so
927 * no need to notify it that the pressure is decreasing.
928 */
929 if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
930 proc_rele(p);
931 continue;
932 }
933 }
934
935 /*
936 * We don't want a small process to block large processes from
937 * being notified again. <rdar://problem/7955532>
938 */
939 resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */
940
941 if (resident_size >= vm_pressure_task_footprint_min) {
942 if (level > 0) {
943 /*
944 * Warning or Critical Pressure.
945 */
946 if (pressure_increase) {
947 if ((curr_task_importance < selected_task_importance) ||
948 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
949 /*
950 * We have found a candidate process which is:
951 * a) at a lower importance than the current selected process
952 * OR
953 * b) has importance equal to that of the current selected process but is larger
954 */
955
956 consider_knote = TRUE;
957 }
958 } else {
959 if ((curr_task_importance > selected_task_importance) ||
960 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
961 /*
962 * We have found a candidate process which is:
963 * a) at a higher importance than the current selected process
964 * OR
965 * b) has importance equal to that of the current selected process but is larger
966 */
967
968 consider_knote = TRUE;
969 }
970 }
971 } else if (level == 0) {
972 /*
973 * Pressure back to normal.
974 */
975 if ((curr_task_importance > selected_task_importance) ||
976 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
977 consider_knote = TRUE;
978 }
979 }
980
981 if (consider_knote) {
982 resident_max = resident_size;
983 kn_max = kn;
984 selected_task_importance = curr_task_importance;
985 consider_knote = FALSE; /* reset for the next candidate */
986 }
987 } else {
988 /* There was no candidate with enough resident memory to scavenge */
989 VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", p->p_pid, resident_size);
990 }
991 proc_rele(p);
992 }
993
994 done_scanning:
995 if (kn_max) {
996 VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, knote_get_kq(kn_max)->kq_p->p_pid, resident_max, 0, 0);
997 VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", knote_get_kq(kn_max)->kq_p->p_pid, resident_max);
998 }
999
1000 return kn_max;
1001 }
1002
1003 static uint64_t next_warning_notification_sent_at_ts = 0;
1004 static uint64_t next_critical_notification_sent_at_ts = 0;
1005
1006 boolean_t memorystatus_manual_testing_on = FALSE;
1007 vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;
1008
1009 kern_return_t
1010 memorystatus_update_vm_pressure(boolean_t target_foreground_process)
1011 {
1012 struct knote *kn_max = NULL;
1013 struct knote *kn_cur = NULL, *kn_temp = NULL;/* for safe list traversal */
1014 pid_t target_pid = -1;
1015 struct klist dispatch_klist = { NULL };
1016 proc_t target_proc = PROC_NULL;
1017 struct task *task = NULL;
1018 boolean_t found_candidate = FALSE;
1019
1020 static vm_pressure_level_t level_snapshot = kVMPressureNormal;
1021 static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
1022 boolean_t smoothing_window_started = FALSE;
1023 struct timeval smoothing_window_start_tstamp = {0, 0};
1024 struct timeval curr_tstamp = {0, 0};
1025 int elapsed_msecs = 0;
1026 uint64_t curr_ts = mach_absolute_time();
1027
1028 #if !CONFIG_JETSAM
1029 #define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */
1030
1031 int idle_kill_counter = 0;
1032
1033 /*
1034 * On desktop we take this opportunity to relieve memory pressure
1035 * by immediately killing idle exitable processes. We use a delay
1036 * to avoid overkill. And we impose a max counter as a fail safe
1037 * in case daemons re-launch too fast.
1038 */
1039 while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
1040 if (memorystatus_idle_exit_from_VM() == FALSE) {
1041 /* No idle exitable processes left to kill */
1042 break;
1043 }
1044 idle_kill_counter++;
1045
1046 if (memorystatus_manual_testing_on == TRUE) {
1047 /*
1048 * Skip the delay when testing
1049 * the pressure notification scheme.
1050 */
1051 } else {
1052 delay(1000000); /* 1 second */
1053 }
1054 }
1055 #endif /* !CONFIG_JETSAM */
1056
1057 if (level_snapshot != kVMPressureNormal) {
1058 /*
1059 * Check to see if we are still in the 'resting' period
1060 * after having notified all clients interested in
1061 * a particular pressure level.
1062 */
1063
1064 level_snapshot = memorystatus_vm_pressure_level;
1065
1066 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1067 if (next_warning_notification_sent_at_ts) {
1068 if (curr_ts < next_warning_notification_sent_at_ts) {
1069 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1070 return KERN_SUCCESS;
1071 }
1072
1073 next_warning_notification_sent_at_ts = 0;
1074 memorystatus_klist_reset_all_for_level(kVMPressureWarning);
1075 }
1076 } else if (level_snapshot == kVMPressureCritical) {
1077 if (next_critical_notification_sent_at_ts) {
1078 if (curr_ts < next_critical_notification_sent_at_ts) {
1079 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1080 return KERN_SUCCESS;
1081 }
1082 next_critical_notification_sent_at_ts = 0;
1083 memorystatus_klist_reset_all_for_level(kVMPressureCritical);
1084 }
1085 }
1086 }
1087
1088 while (1) {
1089 /*
1090 * There is a race window here. But it's not clear
1091 * how much we benefit from having extra synchronization.
1092 */
1093 level_snapshot = memorystatus_vm_pressure_level;
1094
1095 if (prev_level_snapshot > level_snapshot) {
1096 /*
1097 * Pressure decreased? Let's take a little breather
1098 * and see if this condition stays.
1099 */
1100 if (smoothing_window_started == FALSE) {
1101 smoothing_window_started = TRUE;
1102 microuptime(&smoothing_window_start_tstamp);
1103 }
1104
1105 microuptime(&curr_tstamp);
1106 timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
1107 elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
1108
1109 if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
1110 delay(INTER_NOTIFICATION_DELAY);
1111 continue;
1112 }
1113 }
1114
1115 prev_level_snapshot = level_snapshot;
1116 smoothing_window_started = FALSE;
1117
1118 memorystatus_klist_lock();
1119 kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);
1120
1121 if (kn_max == NULL) {
1122 memorystatus_klist_unlock();
1123
1124 /*
1125 * No more level-based clients to notify.
1126 *
1127 * Start the 'resting' window within which clients will not be re-notified.
1128 */
1129
1130 if (level_snapshot != kVMPressureNormal) {
1131 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1132 nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1133
1134 /* Next warning notification (if nothing changes) won't be sent before...*/
1135 next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1136 }
1137
1138 if (level_snapshot == kVMPressureCritical) {
1139 nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1140
1141 /* Next critical notification (if nothing changes) won't be sent before...*/
1142 next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1143 }
1144 }
1145 return KERN_FAILURE;
1146 }
1147
1148 target_proc = knote_get_kq(kn_max)->kq_p;
1149
1150 proc_list_lock();
1151 if (target_proc != proc_ref_locked(target_proc)) {
1152 target_proc = PROC_NULL;
1153 proc_list_unlock();
1154 memorystatus_klist_unlock();
1155 continue;
1156 }
1157 proc_list_unlock();
1158
1159 target_pid = target_proc->p_pid;
1160
1161 task = (struct task *)(target_proc->task);
1162
1163 if (level_snapshot != kVMPressureNormal) {
1164 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1165 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
1166 found_candidate = TRUE;
1167 }
1168 } else {
1169 if (level_snapshot == kVMPressureCritical) {
1170 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
1171 found_candidate = TRUE;
1172 }
1173 }
1174 }
1175 } else {
1176 if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1177 task_clear_has_been_notified(task, kVMPressureWarning);
1178 task_clear_has_been_notified(task, kVMPressureCritical);
1179
1180 found_candidate = TRUE;
1181 }
1182 }
1183
1184 if (found_candidate == FALSE) {
1185 proc_rele(target_proc);
1186 memorystatus_klist_unlock();
1187 continue;
1188 }
1189
1190 SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
1191 int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);
1192
1193 if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
1194 proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
1195 pid_t knote_pid = knote_proc->p_pid;
1196 if (knote_pid == target_pid) {
1197 KNOTE_DETACH(&memorystatus_klist, kn_cur);
1198 KNOTE_ATTACH(&dispatch_klist, kn_cur);
1199 }
1200 }
1201 }
1202
1203 KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
1204
1205 SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
1206 KNOTE_DETACH(&dispatch_klist, kn_cur);
1207 KNOTE_ATTACH(&memorystatus_klist, kn_cur);
1208 }
1209
1210 memorystatus_klist_unlock();
1211
1212 microuptime(&target_proc->vm_pressure_last_notify_tstamp);
1213 proc_rele(target_proc);
1214
1215 if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
1216 break;
1217 }
1218
1219 if (memorystatus_manual_testing_on == TRUE) {
1220 /*
1221 * Testing out the pressure notification scheme.
1222 * No need for delays etc.
1223 */
1224 } else {
1225 uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
1226 #if CONFIG_JETSAM
1227 unsigned int page_delta = 0;
1228 unsigned int skip_delay_page_threshold = 0;
1229
1230 assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
1231
1232 page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
1233 skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
1234
1235 if (memorystatus_available_pages <= skip_delay_page_threshold) {
1236 /*
1237 * We are nearing the critical mark fast and can't afford to wait between
1238 * notifications.
1239 */
1240 sleep_interval = 0;
1241 }
1242 #endif /* CONFIG_JETSAM */
1243
1244 if (sleep_interval) {
1245 delay(sleep_interval);
1246 }
1247 }
1248 }
1249
1250 return KERN_SUCCESS;
1251 }
1252
1253 static uint32_t
1254 convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
1255 {
1256 uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1257
1258 switch (internal_pressure_level) {
1259 case kVMPressureNormal:
1260 {
1261 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1262 break;
1263 }
1264
1265 case kVMPressureWarning:
1266 case kVMPressureUrgent:
1267 {
1268 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1269 break;
1270 }
1271
1272 case kVMPressureCritical:
1273 {
1274 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
1275 break;
1276 }
1277
1278 default:
1279 break;
1280 }
1281
1282 return dispatch_level;
1283 }
1284
1285 /*
1286 * Notify any kexts that are waiting for notification that jetsam
1287 * is approaching the foreground bands. They should use this notification
1288 * to free cached memory.
1289 */
1290 void
1291 memorystatus_issue_fg_band_notify(void)
1292 {
1293 uint64_t now;
1294
1295 lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
1296 absolutetime_to_nanoseconds(mach_absolute_time(), &now);
1297 if (now - memorystatus_jetsam_fg_band_timestamp_ns < memorystatus_jetsam_fg_band_delay_ns) {
1298 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
1299 return;
1300 }
1301
1302 if (memorystatus_jetsam_fg_band_waiters > 0) {
1303 thread_wakeup(&memorystatus_jetsam_fg_band_waiters);
1304 memorystatus_jetsam_fg_band_waiters = 0;
1305 memorystatus_jetsam_fg_band_timestamp_ns = now;
1306 }
1307 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
1308
1309 /* Notify the buffer cache, file systems, etc. to jettison everything they can. */
1310 if (consider_buffer_cache_collect != NULL) {
1311 (void)(*consider_buffer_cache_collect)(1);
1312 }
1313 }
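/*
 * Editor's sketch of the waiter side (an assumption; the real consumers live
 * outside this file): a kext or kernel subsystem that wants this notification
 * would block on the same event address, bumping the waiter count under
 * memorystatus_jetsam_fg_band_lock so the thread_wakeup() above has someone
 * to release.
 *
 *	lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
 *	if (assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
 *	    THREAD_INTERRUPTIBLE) == THREAD_WAITING) {
 *		memorystatus_jetsam_fg_band_waiters++;
 *		lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
 *		thread_block(THREAD_CONTINUE_NULL);
 *	} else {
 *		lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
 *	}
 */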
1314
1315
1316 /*
1317 * Memorystatus notification debugging support
1318 */
1319
1320 static int
1321 sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
1322 {
1323 #pragma unused(arg1, arg2, oidp)
1324 #if CONFIG_EMBEDDED
1325 int error = 0;
1326
1327 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
1328 if (error) {
1329 return error;
1330 }
1331
1332 #endif /* CONFIG_EMBEDDED */
1333 uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
1334
1335 return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
1336 }
1337
1338 #if DEBUG || DEVELOPMENT
1339
1340 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
1341 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
1342
1343 #else /* DEBUG || DEVELOPMENT */
1344
1345 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1346 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
1347
1348 #endif /* DEBUG || DEVELOPMENT */
1349
1350 /*
1351 * Trigger levels to test the mechanism.
1352 * Can be used via a sysctl.
1353 */
1354 #define TEST_LOW_MEMORY_TRIGGER_ONE 1
1355 #define TEST_LOW_MEMORY_TRIGGER_ALL 2
1356 #define TEST_PURGEABLE_TRIGGER_ONE 3
1357 #define TEST_PURGEABLE_TRIGGER_ALL 4
1358 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE 5
1359 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL 6
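/*
 * Editor's sketch (assumptions: run as root, with the NOTE_MEMORYSTATUS_PRESSURE_*
 * constants taken from <sys/event.h>): the value written to the sysctl packs the
 * trigger request in its upper 16 bits and a pressure level in its lower 16 bits,
 * matching the decoding in sysctl_memorypressure_manual_trigger() below. For
 * example, to simulate warning-level pressure against one optimal candidate:
 *
 *	int value = (TEST_LOW_MEMORY_TRIGGER_ONE << 16) | NOTE_MEMORYSTATUS_PRESSURE_WARN;
 *	sysctlbyname("kern.memorypressure_manual_trigger", NULL, NULL, &value, sizeof(value));
 */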
1360
1361 static int
1362 sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
1363 {
1364 #pragma unused(arg1, arg2)
1365
1366 int level = 0;
1367 int error = 0;
1368 int pressure_level = 0;
1369 int trigger_request = 0;
1370 int force_purge;
1371
1372 error = sysctl_handle_int(oidp, &level, 0, req);
1373 if (error || !req->newptr) {
1374 return error;
1375 }
1376
1377 memorystatus_manual_testing_on = TRUE;
1378
1379 trigger_request = (level >> 16) & 0xFFFF;
1380 pressure_level = (level & 0xFFFF);
1381
1382 if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
1383 trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
1384 return EINVAL;
1385 }
1386 switch (pressure_level) {
1387 case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
1388 case NOTE_MEMORYSTATUS_PRESSURE_WARN:
1389 case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
1390 break;
1391 default:
1392 return EINVAL;
1393 }
1394
1395 /*
1396 * The pressure level is being set from user-space.
1397 * And user-space uses the constants in sys/event.h.
1398 * So we translate those events to our internal levels here.
1399 */
1400 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1401 memorystatus_manual_testing_level = kVMPressureNormal;
1402 force_purge = 0;
1403 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
1404 memorystatus_manual_testing_level = kVMPressureWarning;
1405 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1406 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
1407 memorystatus_manual_testing_level = kVMPressureCritical;
1408 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1409 }
1410
1411 memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
1412
1413 /* purge according to the new pressure level */
1414 switch (trigger_request) {
1415 case TEST_PURGEABLE_TRIGGER_ONE:
1416 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
1417 if (force_purge == 0) {
1418 /* no purging requested */
1419 break;
1420 }
1421 vm_purgeable_object_purge_one_unlocked(force_purge);
1422 break;
1423 case TEST_PURGEABLE_TRIGGER_ALL:
1424 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
1425 if (force_purge == 0) {
1426 /* no purging requested */
1427 break;
1428 }
1429 while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
1430 ;
1431 }
1432 break;
1433 }
1434
1435 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
1436 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
1437 memorystatus_update_vm_pressure(TRUE);
1438 }
1439
1440 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
1441 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
1442 while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
1443 continue;
1444 }
1445 }
1446
1447 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1448 memorystatus_manual_testing_on = FALSE;
1449 }
1450
1451 return 0;
1452 }
1453
1454 SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1455 0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
1456
1457
1458 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
1459 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
1460 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");
1461
1462 #if DEBUG || DEVELOPMENT
1463 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");
1464
1465 #if 0
1466 #if CONFIG_JETSAM && VM_PRESSURE_EVENTS
1467 static boolean_t
1468 memorystatus_issue_pressure_kevent(boolean_t pressured)
1469 {
1470 memorystatus_klist_lock();
1471 KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
1472 memorystatus_klist_unlock();
1473 return TRUE;
1474 }
1475 #endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
1476 #endif /* 0 */
1477
1478 /*
1479 * This routine is used for targeted notifications regardless of system memory pressure
1480 * and regardless of whether or not the process has already been notified.
1481 * It bypasses and has no effect on the only-one-notification per soft-limit policy.
1482 *
1483 * "memnote" is the current user.
1484 */
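/*
 * Editor's sketch (assumptions: DEBUG or DEVELOPMENT kernel, caller is root or
 * holds MEMORYSTATUS_ENTITLEMENT, and the target pid already has an
 * EVFILT_MEMORYSTATUS knote registered): the 64-bit value packs the target pid
 * in its low 32 bits and the desired NOTE_MEMORYSTATUS_* flags in its high
 * 32 bits, matching the decoding below. For example, to force a proc-limit
 * warning at a hypothetical target_pid:
 *
 *	uint64_t value = ((uint64_t)NOTE_MEMORYSTATUS_PROC_LIMIT_WARN << 32) | (uint32_t)target_pid;
 *	sysctlbyname("kern.memorystatus_vm_pressure_send", NULL, NULL, &value, sizeof(value));
 */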
1485
1486 static int
1487 sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
1488 {
1489 #pragma unused(arg1, arg2)
1490 /* Need to be root or have memorystatus entitlement */
1491 if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
1492 return EPERM;
1493 }
1494
1495 int error = 0, pid = 0;
1496 struct knote *kn = NULL;
1497 boolean_t found_knote = FALSE;
1498 int fflags = 0; /* filter flags for EVFILT_MEMORYSTATUS */
1499 uint64_t value = 0;
1500
1501 error = sysctl_handle_quad(oidp, &value, 0, req);
1502 if (error || !req->newptr) {
1503 return error;
1504 }
1505
1506 /*
1507 * Find the pid in the low 32 bits of value passed in.
1508 */
1509 pid = (int)(value & 0xFFFFFFFF);
1510
1511 /*
1512 * Find notification in the high 32 bits of the value passed in.
1513 */
1514 fflags = (int)((value >> 32) & 0xFFFFFFFF);
1515
1516 /*
1517 * For backwards compatibility, when no notification is
1518 * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN
1519 */
1520 if (fflags == 0) {
1521 fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1522 // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
1523 }
1524
1525 /* wake up everybody waiting for kVMPressureJetsam */
1526 if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
1527 memorystatus_issue_fg_band_notify();
1528 return error;
1529 }
1530
1531 /*
1532 * See event.h ... fflags for EVFILT_MEMORYSTATUS
1533 */
1534 if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
1535 (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
1536 (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
1537 (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
1538 (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
1539 (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
1540 (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
1541 ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
1542 printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags);
1543 error = 1;
1544 return error;
1545 }
1546
1547 /*
1548 * Forcibly send pid a memorystatus notification.
1549 */
1550
1551 memorystatus_klist_lock();
1552
1553 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1554 proc_t knote_proc = knote_get_kq(kn)->kq_p;
1555 pid_t knote_pid = knote_proc->p_pid;
1556
1557 if (knote_pid == pid) {
1558 /*
1559 * Forcibly send this pid a memorystatus notification.
1560 */
1561 kn->kn_fflags = fflags;
1562 found_knote = TRUE;
1563 }
1564 }
1565
1566 if (found_knote) {
1567 KNOTE(&memorystatus_klist, 0);
1568 printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid);
1569 error = 0;
1570 } else {
1571 printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
1572 error = 1;
1573 }
1574
1575 memorystatus_klist_unlock();
1576
1577 return error;
1578 }
1579
1580 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
1581 0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
1582
1583 #endif /* DEBUG || DEVELOPMENT */
1584
1585 #endif /* VM_PRESSURE_EVENTS */