]> git.saurik.com Git - apple/xnu.git/blame - bsd/kern/kern_memorystatus_notify.c
xnu-7195.81.3.tar.gz
[apple/xnu.git] / bsd / kern / kern_memorystatus_notify.c
CommitLineData
cb323159
A
1/*
2 * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30#include <sys/kern_event.h>
31#include <kern/sched_prim.h>
cb323159
A
32#include <kern/assert.h>
33#include <kern/debug.h>
34#include <kern/locks.h>
35#include <kern/task.h>
36#include <kern/thread.h>
37#include <kern/host.h>
38#include <kern/policy_internal.h>
39#include <kern/thread_group.h>
40
41#include <IOKit/IOBSD.h>
42
43#include <libkern/libkern.h>
44#include <mach/coalition.h>
45#include <mach/mach_time.h>
46#include <mach/task.h>
47#include <mach/host_priv.h>
48#include <mach/mach_host.h>
49#include <os/log.h>
50#include <pexpert/pexpert.h>
51#include <sys/coalition.h>
52#include <sys/kern_event.h>
53#include <sys/proc.h>
54#include <sys/proc_info.h>
55#include <sys/reason.h>
56#include <sys/signal.h>
57#include <sys/signalvar.h>
58#include <sys/sysctl.h>
59#include <sys/sysproto.h>
60#include <sys/time.h>
61#include <sys/wait.h>
62#include <sys/tree.h>
63#include <sys/priv.h>
64#include <vm/vm_pageout.h>
65#include <vm/vm_protos.h>
66#include <mach/machine/sdt.h>
67#include <libkern/section_keywords.h>
68#include <stdatomic.h>
69
70#if CONFIG_FREEZE
71#include <vm/vm_map.h>
72#endif /* CONFIG_FREEZE */
73
74#include <sys/kern_memorystatus.h>
75#include <sys/kern_memorystatus_notify.h>
76
77/*
78 * Memorystatus klist structures
79 */
/*
 * List of every knote attached through the memorystatus filter.
 * All attach/detach operations and traversals in this file are performed
 * with memorystatus_klist_mutex held (see memorystatus_klist_lock()).
 */
struct klist memorystatus_klist;
static lck_mtx_t memorystatus_klist_mutex;
static void memorystatus_klist_lock(void);
static void memorystatus_klist_unlock(void);
84
85/*
86 * Memorystatus kevent filter routines
87 */
static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_memorystatusdetach(struct knote *kn);
static int filt_memorystatus(struct knote *kn, long hint);
static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);

/*
 * Filter operations table binding the handlers declared above to the
 * attach / detach / event / touch / process callback slots.
 */
SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
	.f_attach = filt_memorystatusattach,
	.f_detach = filt_memorystatusdetach,
	.f_event = filt_memorystatus,
	.f_touch = filt_memorystatustouch,
	.f_process = filt_memorystatusprocess,
};
101
102/*
103 * Memorystatus notification events
104 */
/*
 * Hint values handed to KNOTE() on memorystatus_klist and decoded by the
 * switch in filt_memorystatus(). Each value is a distinct bit.
 */
enum {
	kMemorystatusNoPressure = 0x1,          /* system pressure is back to normal */
	kMemorystatusPressure = 0x2,            /* system pressure; warn vs. critical is resolved from memorystatus_vm_pressure_level */
	kMemorystatusLowSwap = 0x4,             /* low swap notification */
	kMemorystatusProcLimitWarn = 0x8,       /* per-process limit warning */
	kMemorystatusProcLimitCritical = 0x10   /* per-process limit critical */
};
112
/*
 * Pacing constants for pressure notifications. Their consumers are not
 * visible in this part of the file; units are taken from the trailing
 * comments on each line.
 */
#define INTER_NOTIFICATION_DELAY (250000) /* .25 second (presumably expressed in microseconds -- confirm at use site) */
#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */
#define WARNING_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
#define CRITICAL_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
117
118/*
119 * Memorystatus notification helper routines
120 */
/* Maps an internal pressure level to the value tested against kn_sfflags (defined later in the file). */
static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
/* Tests a knote's registration for a level and updates the task's "has been notified" bits. */
static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
/* Clears the "has been notified" bit for a level on every task that owns a registered knote. */
static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
/* Picks the single knote that should be notified next for the given level. */
static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process);
/* Thin wrapper that invokes memorystatus_update_vm_pressure(FALSE). */
static void vm_dispatch_memory_pressure(void);
/* Not static: declared here so it can be referenced before its definition. */
kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
127
#if VM_PRESSURE_EVENTS

/*
 * This value is the threshold that a process must meet to be considered for scavenging.
 * It is expressed in MB and compared against the task's physical footprint
 * in vm_pressure_select_optimal_candidate_to_notify().
 */
#if XNU_TARGET_OS_OSX
#define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_PRESSURE_MINIMUM_RSIZE 6 /* MB */
#endif /* XNU_TARGET_OS_OSX */

/* Runtime copy of the threshold above; writable via sysctl on DEVELOPMENT/DEBUG kernels. */
static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;

#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
#endif /* DEVELOPMENT || DEBUG */

/* Current system-wide pressure level; read by filt_memorystatus() to choose WARN vs. CRITICAL. */
vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;

/*
 * We use this flag to signal if we have any HWM offenders
 * on the system. This way we can reduce the number of wakeups
 * of the memorystatus_thread when the system is between the
 * "pressure" and "critical" threshold.
 *
 * The (re-)setting of this variable is done without any locks
 * or synchronization simply because it is not possible (currently)
 * to keep track of HWM offenders that drop down below their memory
 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
 * by allowing the unguarded modification of this variable.
 */
boolean_t memorystatus_hwm_candidates = 0;

#endif /* VM_PRESSURE_EVENTS */
162
/*
 * Available-page counters and thresholds are defined elsewhere; note that
 * their width differs between CONFIG_JETSAM (unsigned int) and
 * non-jetsam (uint64_t) configurations.
 */
#if CONFIG_JETSAM

extern unsigned int memorystatus_available_pages;
extern unsigned int memorystatus_available_pages_pressure;
extern unsigned int memorystatus_available_pages_critical;
extern unsigned int memorystatus_available_pages_critical_base;
extern unsigned int memorystatus_available_pages_critical_idle_offset;

#else /* CONFIG_JETSAM */

extern uint64_t memorystatus_available_pages;
extern uint64_t memorystatus_available_pages_pressure;
extern uint64_t memorystatus_available_pages_critical;

#endif /* CONFIG_JETSAM */

/*
 * Foreground-band jetsam bookkeeping. The lock is defined elsewhere; the
 * consumers of these variables are outside the portion of the file shown here.
 */
extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
uint32_t memorystatus_jetsam_fg_band_waiters = 0;
static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
static uint64_t memorystatus_jetsam_fg_band_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec (i.e. 5 seconds) */

/* Hook supplied elsewhere in the kernel; declared volatile because the pointer itself may change. */
extern boolean_t(*volatile consider_buffer_cache_collect)(int);

#if DEVELOPMENT || DEBUG
/* Exposes the fg-band delay as a read/write tunable on DEVELOPMENT/DEBUG kernels. */
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_fg_band_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
    &memorystatus_jetsam_fg_band_delay_ns, "");
#endif
190
191static int
192filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
193{
194 int error;
195
196 kn->kn_flags |= EV_CLEAR; /* automatically set */
197 kn->kn_sdata = 0; /* incoming data is ignored */
198
199 error = memorystatus_knote_register(kn);
200 if (error) {
201 knote_set_error(kn, error);
202 }
203 return 0;
204}
205
/*
 * f_detach handler: removes the knote from memorystatus_klist.
 */
static void
filt_memorystatusdetach(struct knote *kn)
{
	memorystatus_knote_unregister(kn);
}
211
212static int
213filt_memorystatus(struct knote *kn __unused, long hint)
214{
215 if (hint) {
216 switch (hint) {
217 case kMemorystatusNoPressure:
218 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
219 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
220 }
221 break;
222 case kMemorystatusPressure:
223 if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
224 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
225 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
226 }
227 } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
228 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
229 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
230 }
231 }
232 break;
233 case kMemorystatusLowSwap:
234 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
235 kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
236 }
237 break;
238
239 case kMemorystatusProcLimitWarn:
240 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
241 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
242 }
243 break;
244
245 case kMemorystatusProcLimitCritical:
246 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
247 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
248 }
249 break;
250
251 default:
252 break;
253 }
254 }
255
256#if 0
257 if (kn->kn_fflags != 0) {
258 proc_t knote_proc = knote_get_kq(kn)->kq_p;
259 pid_t knote_pid = knote_proc->p_pid;
260
261 printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
262 (unsigned long)kn, kn->kn_fflags, knote_pid);
263 }
264#endif
265
266 return kn->kn_fflags != 0;
267}
268
269static int
270filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
271{
272 int res;
273 int prev_kn_sfflags = 0;
274
275 memorystatus_klist_lock();
276
277 /*
278 * copy in new kevent settings
279 * (saving the "desired" data and fflags).
280 */
281
282 prev_kn_sfflags = kn->kn_sfflags;
283 kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
284
f427ee49 285#if XNU_TARGET_OS_OSX
cb323159
A
286 /*
287 * Only on desktop do we restrict notifications to
288 * one per active/inactive state (soft limits only).
289 */
290 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
291 /*
292 * Is there previous state to preserve?
293 */
294 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
295 /*
296 * This knote was previously interested in proc_limit_warn,
297 * so yes, preserve previous state.
298 */
299 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
300 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
301 }
302 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
303 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
304 }
305 } else {
306 /*
307 * This knote was not previously interested in proc_limit_warn,
308 * but it is now. Set both states.
309 */
310 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
311 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
312 }
313 }
314
315 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
316 /*
317 * Is there previous state to preserve?
318 */
319 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
320 /*
321 * This knote was previously interested in proc_limit_critical,
322 * so yes, preserve previous state.
323 */
324 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
325 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
326 }
327 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
328 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
329 }
330 } else {
331 /*
332 * This knote was not previously interested in proc_limit_critical,
333 * but it is now. Set both states.
334 */
335 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
336 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
337 }
338 }
f427ee49 339#endif /* XNU_TARGET_OS_OSX */
cb323159
A
340
341 /*
342 * reset the output flags based on a
343 * combination of the old events and
344 * the new desired event list.
345 */
346 //kn->kn_fflags &= kn->kn_sfflags;
347
348 res = (kn->kn_fflags != 0);
349
350 memorystatus_klist_unlock();
351
352 return res;
353}
354
355static int
356filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
357{
358 int res = 0;
359
360 memorystatus_klist_lock();
361 if (kn->kn_fflags) {
362 knote_fill_kevent(kn, kev, 0);
363 res = 1;
364 }
365 memorystatus_klist_unlock();
366
367 return res;
368}
369
/*
 * Acquires memorystatus_klist_mutex, which serialises access to
 * memorystatus_klist and the knotes on it.
 */
static void
memorystatus_klist_lock(void)
{
	lck_mtx_lock(&memorystatus_klist_mutex);
}
375
/*
 * Releases memorystatus_klist_mutex.
 */
static void
memorystatus_klist_unlock(void)
{
	lck_mtx_unlock(&memorystatus_klist_mutex);
}
381
/*
 * One-time setup for the memorystatus kevent machinery: initialises the
 * klist mutex with the caller-supplied lock group/attributes and empties
 * the global knote list.
 */
void
memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
{
	lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
	klist_init(&memorystatus_klist);
}
388
389int
390memorystatus_knote_register(struct knote *kn)
391{
392 int error = 0;
393
394 memorystatus_klist_lock();
395
396 /*
397 * Support only userspace visible flags.
398 */
399 if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
f427ee49 400#if XNU_TARGET_OS_OSX
cb323159
A
401 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
402 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
403 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
404 }
405
406 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
407 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
408 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
409 }
f427ee49 410#endif /* XNU_TARGET_OS_OSX */
cb323159
A
411
412 KNOTE_ATTACH(&memorystatus_klist, kn);
413 } else {
414 error = ENOTSUP;
415 }
416
417 memorystatus_klist_unlock();
418
419 return error;
420}
421
422void
423memorystatus_knote_unregister(struct knote *kn __unused)
424{
425 memorystatus_klist_lock();
426 KNOTE_DETACH(&memorystatus_klist, kn);
427 memorystatus_klist_unlock();
428}
429
430#if VM_PRESSURE_EVENTS
431
432#if CONFIG_MEMORYSTATUS
433
f427ee49
A
434static inline int
435memorystatus_send_note_internal(int event_code, int subclass, void *data, uint32_t data_length)
cb323159
A
436{
437 int ret;
438 struct kev_msg ev_msg;
439
440 ev_msg.vendor_code = KEV_VENDOR_APPLE;
441 ev_msg.kev_class = KEV_SYSTEM_CLASS;
f427ee49 442 ev_msg.kev_subclass = subclass;
cb323159
A
443
444 ev_msg.event_code = event_code;
445
446 ev_msg.dv[0].data_length = data_length;
447 ev_msg.dv[0].data_ptr = data;
448 ev_msg.dv[1].data_length = 0;
449
450 ret = kev_post_msg(&ev_msg);
451 if (ret) {
452 printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
453 }
454
455 return ret;
456}
457
f427ee49
A
/*
 * Posts a kernel event in the KEV_MEMORYSTATUS_SUBCLASS subclass.
 * Returns the result of the underlying post.
 */
int
memorystatus_send_note(int event_code, void *data, uint32_t data_length)
{
	return memorystatus_send_note_internal(event_code, KEV_MEMORYSTATUS_SUBCLASS, data, data_length);
}
463
/*
 * Posts a kDirtyStatusChangeNote kernel event in the
 * KEV_DIRTYSTATUS_SUBCLASS subclass. Returns the result of the underlying post.
 */
int
memorystatus_send_dirty_status_change_note(void *data, uint32_t data_length)
{
	return memorystatus_send_note_internal(kDirtyStatusChangeNote, KEV_DIRTYSTATUS_SUBCLASS, data, data_length);
}
469
/*
 * Marks the memorystatus knotes owned by process p for delivery of a
 * per-process memory-limit notification and fires the klist.
 *
 * p              - must be the current process (asserted); no reference or
 *                  lock is taken on it.
 * is_active      - selects which one-shot state bit (active vs. inactive)
 *                  is consulted/consumed for soft limits on desktop platforms.
 * is_fatal       - TRUE when the limit is a hard (fatal) limit.
 * limit_exceeded - FALSE: the process is approaching its limit (warning);
 *                  TRUE: the process has exceeded its limit (critical).
 *
 * Returns TRUE when at least one knote belonging to p was registered for the
 * relevant notification (found_knote), even if delivery was suppressed
 * because the one-shot bit had already been consumed; FALSE otherwise.
 *
 * NOTE(review): is_active and is_fatal carry __unused annotations but are
 * read in the desktop branch below.
 */
boolean_t
memorystatus_warn_process(const proc_t p, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
{
	/*
	 * This function doesn't take a reference to p or lock it. So it better be the current process.
	 */
	assert(p == current_proc());
	pid_t pid = p->p_pid;
	boolean_t ret = FALSE;
	boolean_t found_knote = FALSE;
	struct knote *kn = NULL;
	int send_knote_count = 0;
	uint32_t platform;
	platform = proc_platform(p);

	/*
	 * See comment in sysctl_memorystatus_vm_pressure_send.
	 */

	memorystatus_klist_lock();

	/* Visit every registered knote, acting only on those owned by pid. */
	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		proc_t knote_proc = knote_get_kq(kn)->kq_p;
		pid_t knote_pid = knote_proc->p_pid;

		if (knote_pid == pid) {
			/*
			 * By setting the "fflags" here, we are forcing
			 * a process to deal with the case where it's
			 * bumping up into its memory limits. If we don't
			 * do this here, we will end up depending on the
			 * system pressure snapshot evaluation in
			 * filt_memorystatus().
			 */

			/*
			 * The type of notification and the frequency are different between
			 * embedded and desktop.
			 *
			 * Embedded processes register for global pressure notifications
			 * (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) via UIKit
			 * (see applicationDidReceiveMemoryWarning in UIKit). We'll warn them here if
			 * they are near their memory limit. filt_memorystatus() will warn them based
			 * on the system pressure level.
			 *
			 * On desktop, (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL)
			 * are only expected to fire for system level warnings. Desktop processes
			 * register for NOTE_MEMORYSTATUS_PROC_LIMIT_WARN
			 * if they want to be warned when they approach their limit
			 * and for NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL to be warned when they
			 * exceed their limit.
			 *
			 * On embedded we continuously warn processes that are approaching their
			 * memory limit. However on desktop, we only send one warning while
			 * the process is active/inactive if the limit is soft.
			 *
			 */
			if (platform == PLATFORM_MACOS || platform == PLATFORM_MACCATALYST || platform == PLATFORM_DRIVERKIT) {
				if (!limit_exceeded) {
					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
						found_knote = TRUE;
						if (!is_fatal) {
							/*
							 * Restrict proc_limit_warn notifications when
							 * non-fatal (soft) limit is at play.
							 */
							if (is_active) {
								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
									/*
									 * Mark this knote for delivery.
									 */
									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
									/*
									 * And suppress it from future notifications.
									 */
									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
									send_knote_count++;
								}
							} else {
								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
									/*
									 * Mark this knote for delivery.
									 */
									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
									/*
									 * And suppress it from future notifications.
									 */
									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
									send_knote_count++;
								}
							}
						} else {
							/*
							 * No restriction on proc_limit_warn notifications when
							 * fatal (hard) limit is at play.
							 */
							kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
							send_knote_count++;
						}
					}
				} else {
					/*
					 * Send this notification when a process has exceeded a soft limit.
					 */

					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
						found_knote = TRUE;
						if (!is_fatal) {
							/*
							 * Restrict critical notifications for soft limits.
							 */

							if (is_active) {
								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
									/*
									 * Suppress future proc_limit_critical notifications
									 * for the active soft limit.
									 */
									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
									send_knote_count++;
								}
							} else {
								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
									/*
									 * Suppress future proc_limit_critical notifications
									 * for the inactive soft limit.
									 */
									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
									send_knote_count++;
								}
							}
						} else {
							/*
							 * We should never be trying to send a critical notification for
							 * a hard limit... the process would be killed before it could be
							 * received.
							 */
							panic("Caught sending pid %d a critical warning for a fatal limit.\n", pid);
						}
					}
				}
			} else {
				/* Non-desktop platforms: no one-shot suppression is applied. */
				if (!limit_exceeded) {
					/*
					 * Intentionally set either the unambiguous limit warning,
					 * the system-wide critical or the system-wide warning
					 * notification bit.
					 */

					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
						kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
						found_knote = TRUE;
						send_knote_count++;
					} else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
						kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
						found_knote = TRUE;
						send_knote_count++;
					} else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
						kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
						found_knote = TRUE;
						send_knote_count++;
					}
				} else {
					/*
					 * Send this notification when a process has exceeded a soft limit.
					 */
					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
						kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
						found_knote = TRUE;
						send_knote_count++;
					}
				}
			}
		}
	}

	if (found_knote) {
		if (send_knote_count > 0) {
			/*
			 * Hint 0: filt_memorystatus() leaves kn_fflags alone and simply
			 * reports the flags that were set directly above.
			 */
			KNOTE(&memorystatus_klist, 0);
		}
		ret = TRUE;
	}

	memorystatus_klist_unlock();

	return ret;
}
659
660/*
661 * Can only be set by the current task on itself.
662 */
663int
664memorystatus_low_mem_privileged_listener(uint32_t op_flags)
665{
666 boolean_t set_privilege = FALSE;
667 /*
668 * Need an entitlement check here?
669 */
670 if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
671 set_privilege = TRUE;
672 } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
673 set_privilege = FALSE;
674 } else {
675 return EINVAL;
676 }
677
678 return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
679}
680
/*
 * Posts a kMemorystatusPressureNote kernel event whose payload is the
 * given pid. Returns the result of memorystatus_send_note().
 */
int
memorystatus_send_pressure_note(pid_t pid)
{
	MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
	/* The address of the by-value parameter is used as the payload buffer. */
	return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
}
687
688boolean_t
689memorystatus_is_foreground_locked(proc_t p)
690{
691 return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
692 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
693}
694
695/*
696 * This is meant for stackshot and kperf -- it does not take the proc_list_lock
697 * to access the p_memstat_dirty field.
698 */
699void
700memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
701{
702 if (!v) {
703 *is_dirty = FALSE;
704 *is_dirty_tracked = FALSE;
705 *allow_idle_exit = FALSE;
706 } else {
707 proc_t p = (proc_t)v;
708 *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
709 *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
710 *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
711 }
712}
713
714boolean_t
715memorystatus_bg_pressure_eligible(proc_t p)
716{
717 boolean_t eligible = FALSE;
718
719 proc_list_lock();
720
721 MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);
722
723 /* Foreground processes have already been dealt with at this point, so just test for eligibility */
724 if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
725 eligible = TRUE;
726 }
727
728 if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
729 /*
730 * IDLE and IDLE_DEFERRED bands contain processes
731 * that have dropped memory to be under their inactive
732 * memory limits. And so they can't really give back
733 * anything.
734 */
735 eligible = FALSE;
736 }
737
738 proc_list_unlock();
739
740 return eligible;
741}
742
/*
 * Fires a low-swap notification to all memorystatus listeners, provided at
 * least one registered knote asked for NOTE_MEMORYSTATUS_LOW_SWAP.
 */
void
memorystatus_send_low_swap_note(void)
{
	struct knote *kn = NULL;

	memorystatus_klist_lock();
	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		/* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
		 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
		 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
		 * kMemorystatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly.
		 * No task and no pressure levels are supplied (NULL, 0, 0): only the registration test is of interest here. */
		if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
			KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
			break;
		}
	}

	memorystatus_klist_unlock();
}
762
763#endif /* CONFIG_MEMORYSTATUS */
764
765/*
766 * kn_max - knote
767 *
768 * knote_pressure_level - to check if the knote is registered for this notification level.
769 *
770 * task - task whose bits we'll be modifying
771 *
772 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
773 *
774 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
775 *
776 */
777
778static boolean_t
779is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
780{
781 if (kn_max->kn_sfflags & knote_pressure_level) {
782 if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
783 task_clear_has_been_notified(task, pressure_level_to_clear);
784 }
785
786 task_mark_has_been_notified(task, pressure_level_to_set);
787 return TRUE;
788 }
789
790 return FALSE;
791}
792
793static void
794memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
795{
796 struct knote *kn = NULL;
797
798 memorystatus_klist_lock();
799 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
800 proc_t p = PROC_NULL;
801 struct task* t = TASK_NULL;
802
803 p = knote_get_kq(kn)->kq_p;
804 proc_list_lock();
805 if (p != proc_ref_locked(p)) {
806 p = PROC_NULL;
807 proc_list_unlock();
808 continue;
809 }
810 proc_list_unlock();
811
812 t = (struct task *)(p->task);
813
814 task_clear_has_been_notified(t, pressure_level_to_clear);
815
816 proc_rele(p);
817 }
818
819 memorystatus_klist_unlock();
820}
821
822/*
823 * Used by the vm_pressure_thread which is
824 * signalled from within vm_pageout_scan().
825 */
826
/*
 * Entry point for dispatching VM pressure notifications; simply forwards
 * to vm_dispatch_memory_pressure().
 */
void
consider_vm_pressure_events(void)
{
	vm_dispatch_memory_pressure();
}
832
/*
 * Runs one pressure-notification pass that is not restricted to
 * foreground processes. The kern_return_t result is ignored.
 */
static void
vm_dispatch_memory_pressure(void)
{
	memorystatus_update_vm_pressure(FALSE);
}
838
/*
 * Scans candidate_list and picks the single knote that should receive the
 * next pressure notification for the given level.
 *
 * candidate_list            - knotes to choose from.
 * level                     - internal pressure level (0 means "back to normal").
 * target_foreground_process - when TRUE (and CONFIG_MEMORYSTATUS), only
 *                             foreground processes are considered; when FALSE,
 *                             processes failing memorystatus_bg_pressure_eligible()
 *                             are skipped.
 *
 * Selection rules, as implemented below:
 *  - The knote must be registered for the dispatch form of `level`.
 *  - For level > 0, tasks already notified of this level are skipped, and a
 *    privileged listener that has not yet been notified is chosen immediately.
 *  - For level == 0, only tasks previously notified of warning or critical
 *    pressure are considered.
 *  - The task's physical footprint must be at least
 *    vm_pressure_task_footprint_min MB.
 *  - While pressure is rising (or holding), the least important task wins;
 *    otherwise the most important one wins. Ties go to the larger footprint.
 *
 * Returns the chosen knote, or NULL when no candidate qualifies.
 *
 * Note: pressure_snapshot is function-static state that remembers the level
 * seen on the previous call; no locking for it is visible in this function.
 */
static struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process)
{
	struct knote *kn = NULL, *kn_max = NULL;
	uint64_t resident_max = 0;/* MB */
	int selected_task_importance = 0;
	static int pressure_snapshot = -1;
	boolean_t pressure_increase = FALSE;

	if (pressure_snapshot == -1) {
		/*
		 * Initial snapshot.
		 */
		pressure_snapshot = level;
		pressure_increase = TRUE;
	} else {
		/* A non-zero level that is not lower than last time counts as an increase. */
		if (level && (level >= pressure_snapshot)) {
			pressure_increase = TRUE;
		} else {
			pressure_increase = FALSE;
		}

		pressure_snapshot = level;
	}

	if (pressure_increase == TRUE) {
		/*
		 * We'll start by considering the largest
		 * unimportant task in our list.
		 */
		selected_task_importance = INT_MAX;
	} else {
		/*
		 * We'll start by considering the largest
		 * important task in our list.
		 */
		selected_task_importance = 0;
	}

	SLIST_FOREACH(kn, candidate_list, kn_selnext) {
		uint64_t resident_size = 0;/* MB */
		proc_t p = PROC_NULL;
		struct task* t = TASK_NULL;
		int curr_task_importance = 0;
		boolean_t consider_knote = FALSE;
		boolean_t privileged_listener = FALSE;

		/*
		 * Take a reference on the owning process; every path out of this
		 * iteration from here on must drop it with proc_rele().
		 */
		p = knote_get_kq(kn)->kq_p;
		proc_list_lock();
		if (p != proc_ref_locked(p)) {
			p = PROC_NULL;
			proc_list_unlock();
			continue;
		}
		proc_list_unlock();

#if CONFIG_MEMORYSTATUS
		if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
			/*
			 * Skip process not marked foreground.
			 */
			proc_rele(p);
			continue;
		}
#endif /* CONFIG_MEMORYSTATUS */

		t = (struct task *)(p->task);

		vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);

		/* Skip listeners that did not register for this level. */
		if ((kn->kn_sfflags & dispatch_level) == 0) {
			proc_rele(p);
			continue;
		}

#if CONFIG_MEMORYSTATUS
		if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
			VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid);
			proc_rele(p);
			continue;
		}
#endif /* CONFIG_MEMORYSTATUS */

		/* The importance metric differs by target: task estimate on macOS, jetsam band elsewhere. */
#if XNU_TARGET_OS_OSX
		curr_task_importance = task_importance_estimate(t);
#else /* XNU_TARGET_OS_OSX */
		curr_task_importance = p->p_memstat_effectivepriority;
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Privileged listeners are only considered in the multi-level pressure scheme
		 * AND only if the pressure is increasing.
		 */
		if (level > 0) {
			if (task_has_been_notified(t, level) == FALSE) {
				/*
				 * Is this a privileged listener?
				 */
				if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
					if (privileged_listener) {
						/*
						 * Chosen immediately; resident_max is not updated
						 * on this path.
						 */
						kn_max = kn;
						proc_rele(p);
						goto done_scanning;
					}
				}
			} else {
				/* Already told about this level. */
				proc_rele(p);
				continue;
			}
		} else if (level == 0) {
			/*
			 * Task wasn't notified when the pressure was increasing and so
			 * no need to notify it that the pressure is decreasing.
			 */
			if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
				proc_rele(p);
				continue;
			}
		}

		/*
		 * We don't want a small process to block large processes from
		 * being notified again. <rdar://problem/7955532>
		 */
		resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */

		if (resident_size >= vm_pressure_task_footprint_min) {
			if (level > 0) {
				/*
				 * Warning or Critical Pressure.
				 */
				if (pressure_increase) {
					if ((curr_task_importance < selected_task_importance) ||
					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
						/*
						 * We have found a candidate process which is:
						 * a) at a lower importance than the current selected process
						 * OR
						 * b) has importance equal to that of the current selected process but is larger
						 */

						consider_knote = TRUE;
					}
				} else {
					if ((curr_task_importance > selected_task_importance) ||
					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
						/*
						 * We have found a candidate process which is:
						 * a) at a higher importance than the current selected process
						 * OR
						 * b) has importance equal to that of the current selected process but is larger
						 */

						consider_knote = TRUE;
					}
				}
			} else if (level == 0) {
				/*
				 * Pressure back to normal.
				 */
				if ((curr_task_importance > selected_task_importance) ||
				    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
					consider_knote = TRUE;
				}
			}

			if (consider_knote) {
				/* This candidate becomes the new best choice. */
				resident_max = resident_size;
				kn_max = kn;
				selected_task_importance = curr_task_importance;
				consider_knote = FALSE; /* reset for the next candidate */
			}
		} else {
			/* There was no candidate with enough resident memory to scavenge */
			VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", p->p_pid, resident_size);
		}
		proc_rele(p);
	}

done_scanning:
	if (kn_max) {
		VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, knote_get_kq(kn_max)->kq_p->p_pid, resident_max, 0, 0);
		VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", knote_get_kq(kn_max)->kq_p->p_pid, resident_max);
	}

	return kn_max;
}
1026
/*
 * Deadlines, in mach_absolute_time() units, that end the 'resting' period
 * armed by memorystatus_update_vm_pressure() once no more clients are left
 * to notify for the warning/urgent or critical level.
 * A value of 0 means no resting period is currently armed.
 */
1027 static uint64_t next_warning_notification_sent_at_ts = 0;
1028 static uint64_t next_critical_notification_sent_at_ts = 0;
1029
/*
 * Manual-testing state. Both are written by the
 * memorypressure_manual_trigger sysctl handler; while
 * memorystatus_manual_testing_on is TRUE, memorystatus_update_vm_pressure()
 * skips its inter-notification delays.
 */
1030 boolean_t memorystatus_manual_testing_on = FALSE;
1031 vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;
1032
/*
 * memorystatus_update_vm_pressure
 *
 * Delivers the current memorystatus_vm_pressure_level to registered
 * memorystatus knotes. Each pass of the main loop selects one knote via
 * vm_pressure_select_optimal_candidate_to_notify(), then fires every knote
 * on memorystatus_klist that belongs to the same pid and is registered for
 * the level being dispatched.
 *
 * target_foreground_process:
 *     Forwarded to vm_pressure_select_optimal_candidate_to_notify(). When
 *     manual testing is on and this is TRUE, the loop stops after a single
 *     notification.
 *
 * Returns:
 *     KERN_SUCCESS  - the call fell inside a notification 'resting' period
 *                     (after a delay), or the single-notification break
 *                     above was taken.
 *     KERN_FAILURE  - no candidate knote is left to notify; the resting
 *                     period for the current (non-normal) level has been
 *                     armed before returning.
 *
 * level_snapshot and prev_level_snapshot are function-static, so their
 * values carry over from one call to the next.
 */
1033 kern_return_t
1034 memorystatus_update_vm_pressure(boolean_t target_foreground_process)
1035 {
1036 struct knote *kn_max = NULL;
1037 struct knote *kn_cur = NULL, *kn_temp = NULL;/* for safe list traversal */
1038 pid_t target_pid = -1;
1039 struct klist dispatch_klist = { NULL };
1040 proc_t target_proc = PROC_NULL;
1041 struct task *task = NULL;
1042 boolean_t found_candidate = FALSE;
1043
1044 static vm_pressure_level_t level_snapshot = kVMPressureNormal;
1045 static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
1046 boolean_t smoothing_window_started = FALSE;
1047 struct timeval smoothing_window_start_tstamp = {0, 0};
1048 struct timeval curr_tstamp = {0, 0};
f427ee49 1049 int64_t elapsed_msecs = 0;
cb323159
A
/*
 * curr_ts holds the time of entry for the resting-period checks below; it
 * is later reused as scratch storage for the resting-period length.
 */
1050 uint64_t curr_ts = mach_absolute_time();
1051
1052 #if !CONFIG_JETSAM
1053 #define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */
1054
1055 int idle_kill_counter = 0;
1056
1057 /*
1058 * On desktop we take this opportunity to free up memory pressure
1059 * by immediately killing idle exitable processes. We use a delay
1060 * to avoid overkill. And we impose a max counter as a fail safe
1061 * in case daemons re-launch too fast.
1062 */
1063 while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
1064 if (memorystatus_idle_exit_from_VM() == FALSE) {
1065 /* No idle exitable processes left to kill */
1066 break;
1067 }
1068 idle_kill_counter++;
1069
1070 if (memorystatus_manual_testing_on == TRUE) {
1071 /*
1072 * Skip the delay when testing
1073 * the pressure notification scheme.
1074 */
1075 } else {
1076 delay(1000000); /* 1 second */
1077 }
1078 }
1079 #endif /* !CONFIG_JETSAM */
1080
/*
 * level_snapshot still holds the value left by the previous call here; it
 * is refreshed inside the branch before the resting-period deadlines are
 * consulted.
 */
1081 if (level_snapshot != kVMPressureNormal) {
1082 /*
1083 * Check to see if we are still in the 'resting' period
1084 * after having notified all clients interested in
1085 * a particular pressure level.
1086 */
1087
1088 level_snapshot = memorystatus_vm_pressure_level;
1089
1090 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1091 if (next_warning_notification_sent_at_ts) {
1092 if (curr_ts < next_warning_notification_sent_at_ts) {
1093 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1094 return KERN_SUCCESS;
1095 }
1096
/* Resting period is over: disarm it and reset the klist state for this level. */
1097 next_warning_notification_sent_at_ts = 0;
1098 memorystatus_klist_reset_all_for_level(kVMPressureWarning);
1099 }
1100 } else if (level_snapshot == kVMPressureCritical) {
1101 if (next_critical_notification_sent_at_ts) {
1102 if (curr_ts < next_critical_notification_sent_at_ts) {
1103 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1104 return KERN_SUCCESS;
1105 }
1106 next_critical_notification_sent_at_ts = 0;
1107 memorystatus_klist_reset_all_for_level(kVMPressureCritical);
1108 }
1109 }
1110 }
1111
1112 while (1) {
1113 /*
1114 * There is a race window here. But it's not clear
1115 * how much we benefit from having extra synchronization.
1116 */
1117 level_snapshot = memorystatus_vm_pressure_level;
1118
1119 if (prev_level_snapshot > level_snapshot) {
1120 /*
1121 * Pressure decreased? Let's take a little breather
1122 * and see if this condition stays.
1123 */
1124 if (smoothing_window_started == FALSE) {
1125 smoothing_window_started = TRUE;
1126 microuptime(&smoothing_window_start_tstamp);
1127 }
1128
/* Time spent in the smoothing window so far, in milliseconds. */
1129 microuptime(&curr_tstamp);
1130 timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
1131 elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
1132
1133 if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
1134 delay(INTER_NOTIFICATION_DELAY);
1135 continue;
1136 }
1137 }
1138
1139 prev_level_snapshot = level_snapshot;
1140 smoothing_window_started = FALSE;
1141
1142 memorystatus_klist_lock();
1143 kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);
1144
1145 if (kn_max == NULL) {
1146 memorystatus_klist_unlock();
1147
1148 /*
1149 * No more level-based clients to notify.
1150 *
1151 * Start the 'resting' window within which clients will not be re-notified.
1152 */
1153
1154 if (level_snapshot != kVMPressureNormal) {
1155 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
/* curr_ts is overwritten with the resting-period length in absolute-time units. */
1156 nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1157
1158 /* Next warning notification (if nothing changes) won't be sent before...*/
1159 next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1160 }
1161
1162 if (level_snapshot == kVMPressureCritical) {
1163 nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1164
1165 /* Next critical notification (if nothing changes) won't be sent before...*/
1166 next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1167 }
1168 }
1169 return KERN_FAILURE;
1170 }
1171
1172 target_proc = knote_get_kq(kn_max)->kq_p;
1173
/*
 * Try to take a reference on the selected process. A result other than
 * target_proc itself is treated as failure: drop both locks and select again.
 */
1174 proc_list_lock();
1175 if (target_proc != proc_ref_locked(target_proc)) {
1176 target_proc = PROC_NULL;
1177 proc_list_unlock();
1178 memorystatus_klist_unlock();
1179 continue;
1180 }
1181 proc_list_unlock();
1182
1183 target_pid = target_proc->p_pid;
1184
1185 task = (struct task *)(target_proc->task);
1186
/*
 * Confirm that the selected knote is registered for the level being
 * dispatched. For the normal level, the task's warning and critical
 * has-been-notified state is cleared instead.
 */
1187 if (level_snapshot != kVMPressureNormal) {
1188 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1189 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
1190 found_candidate = TRUE;
1191 }
1192 } else {
1193 if (level_snapshot == kVMPressureCritical) {
1194 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
1195 found_candidate = TRUE;
1196 }
1197 }
1198 }
1199 } else {
1200 if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1201 task_clear_has_been_notified(task, kVMPressureWarning);
1202 task_clear_has_been_notified(task, kVMPressureCritical);
1203
1204 found_candidate = TRUE;
1205 }
1206 }
1207
/*
 * NOTE(review): found_candidate is only ever set to TRUE in this function
 * and is not cleared at the top of the loop, so once one candidate has been
 * accepted this check cannot fail in later iterations - confirm that this
 * is intended.
 */
1208 if (found_candidate == FALSE) {
1209 proc_rele(target_proc);
1210 memorystatus_klist_unlock();
1211 continue;
1212 }
1213
/*
 * Move every knote that belongs to target_pid and is registered for this
 * level onto the private dispatch_klist, fire that list, then move the
 * knotes back onto memorystatus_klist.
 *
 * NOTE(review): is_knote_registered_modify_task_pressure_bits() is called
 * with the target's task for every knote on the list, before the pid
 * comparison - verify against that helper whether this is intended.
 */
1214 SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
1215 int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);
1216
1217 if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
1218 proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
1219 pid_t knote_pid = knote_proc->p_pid;
1220 if (knote_pid == target_pid) {
1221 KNOTE_DETACH(&memorystatus_klist, kn_cur);
1222 KNOTE_ATTACH(&dispatch_klist, kn_cur);
1223 }
1224 }
1225 }
1226
1227 KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
1228
1229 SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
1230 KNOTE_DETACH(&dispatch_klist, kn_cur);
1231 KNOTE_ATTACH(&memorystatus_klist, kn_cur);
1232 }
1233
1234 memorystatus_klist_unlock();
1235
/* Record when this process was last notified, then drop the reference taken above. */
1236 microuptime(&target_proc->vm_pressure_last_notify_tstamp);
1237 proc_rele(target_proc);
1238
1239 if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
1240 break;
1241 }
1242
1243 if (memorystatus_manual_testing_on == TRUE) {
1244 /*
1245 * Testing out the pressure notification scheme.
1246 * No need for delays etc.
1247 */
1248 } else {
1249 uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
1250 #if CONFIG_JETSAM
1251 unsigned int page_delta = 0;
1252 unsigned int skip_delay_page_threshold = 0;
1253
1254 assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
1255
/*
 * The skip-delay threshold is the midpoint between the pressure threshold
 * and the critical base threshold.
 */
1256 page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
1257 skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
1258
1259 if (memorystatus_available_pages <= skip_delay_page_threshold) {
1260 /*
1261 * We are nearing the critical mark fast and can't afford to wait between
1262 * notifications.
1263 */
1264 sleep_interval = 0;
1265 }
1266 #endif /* CONFIG_JETSAM */
1267
1268 if (sleep_interval) {
1269 delay(sleep_interval);
1270 }
1271 }
1272 }
1273
1274 return KERN_SUCCESS;
1275 }
1276
1277static uint32_t
1278convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
1279{
1280 uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1281
1282 switch (internal_pressure_level) {
1283 case kVMPressureNormal:
1284 {
1285 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1286 break;
1287 }
1288
1289 case kVMPressureWarning:
1290 case kVMPressureUrgent:
1291 {
1292 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1293 break;
1294 }
1295
1296 case kVMPressureCritical:
1297 {
1298 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
1299 break;
1300 }
1301
1302 default:
1303 break;
1304 }
1305
1306 return dispatch_level;
1307}
1308
1309/*
1310 * Notify any kexts that are waiting for notification that jetsam
1311 * is approaching the foreground bands. They should use this notification
1312 * to free cached memory.
1313 */
1314void
1315memorystatus_issue_fg_band_notify(void)
1316{
1317 uint64_t now;
1318
1319 lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
1320 absolutetime_to_nanoseconds(mach_absolute_time(), &now);
1321 if (now - memorystatus_jetsam_fg_band_timestamp_ns < memorystatus_jetsam_fg_band_delay_ns) {
1322 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
1323 return;
1324 }
1325
1326 if (memorystatus_jetsam_fg_band_waiters > 0) {
1327 thread_wakeup(&memorystatus_jetsam_fg_band_waiters);
1328 memorystatus_jetsam_fg_band_waiters = 0;
1329 memorystatus_jetsam_fg_band_timestamp_ns = now;
1330 }
1331 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
1332
1333 /* Notify the buffer cache, file systems, etc. to jetison everything they can. */
1334 if (consider_buffer_cache_collect != NULL) {
1335 (void)(*consider_buffer_cache_collect)(1);
1336 }
1337}
1338
1339
1340/*
1341 * Memorystatus notification debugging support
1342 */
1343
1344static int
1345sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
1346{
1347#pragma unused(arg1, arg2, oidp)
f427ee49 1348#if !XNU_TARGET_OS_OSX
cb323159
A
1349 int error = 0;
1350
1351 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
1352 if (error) {
1353 return error;
1354 }
1355
f427ee49 1356#endif /* !XNU_TARGET_OS_OSX */
cb323159
A
1357 uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
1358
1359 return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
1360}
1361
/*
 * Register memorystatus_vm_pressure_level under _kern as a read-only int
 * served by sysctl_memorystatus_vm_pressure_level. The two variants differ
 * only in CTLFLAG_MASKED, which is added when the build is neither DEBUG
 * nor DEVELOPMENT.
 */
1362 #if DEBUG || DEVELOPMENT
1363
1364 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
1365 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
1366
1367 #else /* DEBUG || DEVELOPMENT */
1368
1369 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1370 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
1371
1372 #endif /* DEBUG || DEVELOPMENT */
1373
1374/*
1375 * Trigger levels to test the mechanism.
1376 * Can be used via a sysctl.
1377 */
1378#define TEST_LOW_MEMORY_TRIGGER_ONE 1
1379#define TEST_LOW_MEMORY_TRIGGER_ALL 2
1380#define TEST_PURGEABLE_TRIGGER_ONE 3
1381#define TEST_PURGEABLE_TRIGGER_ALL 4
1382#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE 5
1383#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL 6
1384
1385static int
1386sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
1387{
1388#pragma unused(arg1, arg2)
1389
1390 int level = 0;
1391 int error = 0;
1392 int pressure_level = 0;
1393 int trigger_request = 0;
1394 int force_purge;
1395
1396 error = sysctl_handle_int(oidp, &level, 0, req);
1397 if (error || !req->newptr) {
1398 return error;
1399 }
1400
1401 memorystatus_manual_testing_on = TRUE;
1402
1403 trigger_request = (level >> 16) & 0xFFFF;
1404 pressure_level = (level & 0xFFFF);
1405
1406 if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
1407 trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
1408 return EINVAL;
1409 }
1410 switch (pressure_level) {
1411 case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
1412 case NOTE_MEMORYSTATUS_PRESSURE_WARN:
1413 case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
1414 break;
1415 default:
1416 return EINVAL;
1417 }
1418
1419 /*
1420 * The pressure level is being set from user-space.
1421 * And user-space uses the constants in sys/event.h
1422 * So we translate those events to our internal levels here.
1423 */
1424 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1425 memorystatus_manual_testing_level = kVMPressureNormal;
1426 force_purge = 0;
1427 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
1428 memorystatus_manual_testing_level = kVMPressureWarning;
1429 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1430 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
1431 memorystatus_manual_testing_level = kVMPressureCritical;
1432 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1433 }
1434
1435 memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
1436
1437 /* purge according to the new pressure level */
1438 switch (trigger_request) {
1439 case TEST_PURGEABLE_TRIGGER_ONE:
1440 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
1441 if (force_purge == 0) {
1442 /* no purging requested */
1443 break;
1444 }
1445 vm_purgeable_object_purge_one_unlocked(force_purge);
1446 break;
1447 case TEST_PURGEABLE_TRIGGER_ALL:
1448 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
1449 if (force_purge == 0) {
1450 /* no purging requested */
1451 break;
1452 }
1453 while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
1454 ;
1455 }
1456 break;
1457 }
1458
1459 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
1460 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
1461 memorystatus_update_vm_pressure(TRUE);
1462 }
1463
1464 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
1465 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
1466 while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
1467 continue;
1468 }
1469 }
1470
1471 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1472 memorystatus_manual_testing_on = FALSE;
1473 }
1474
1475 return 0;
1476}
1477
/*
 * Register memorypressure_manual_trigger under _kern as a write-only int
 * served by sysctl_memorypressure_manual_trigger.
 */
1478 SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1479 0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
1480
1481
/*
 * Read-write knobs backed directly by the vm_pageout_state.memorystatus_purge_on_*
 * fields. The warning and critical values are what the manual trigger
 * handler passes to vm_purgeable_object_purge_one_unlocked().
 * NOTE(review): the first entry omits CTLTYPE_INT while the other two pass
 * it - confirm whether the difference is intentional.
 */
1482 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
1483 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
1484 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");
1485
1486#if DEBUG || DEVELOPMENT
/* Read-write unsigned sysctl backed directly by vm_pressure_events_enabled. */
1487 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");
1488
/*
 * Compiled out by the surrounding "#if 0". When enabled, this fires every
 * knote on memorystatus_klist under the klist lock, with
 * kMemorystatusPressure or kMemorystatusNoPressure as the hint depending
 * on 'pressured', and always returns TRUE.
 */
1489 #if 0
1490 #if CONFIG_JETSAM && VM_PRESSURE_EVENTS
1491 static boolean_t
1492 memorystatus_issue_pressure_kevent(boolean_t pressured)
1493 {
1494 memorystatus_klist_lock();
1495 KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
1496 memorystatus_klist_unlock();
1497 return TRUE;
1498 }
1499 #endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
1500 #endif /* 0 */
1501
1502/*
1503 * This routine is used for targeted notifications regardless of system memory pressure
1504 * and regardless of whether or not the process has already been notified.
1505 * It bypasses and has no effect on the only-one-notification per soft-limit policy.
1506 *
1507 * "memnote" is the current user.
1508 */
1509
1510static int
1511sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
1512{
1513#pragma unused(arg1, arg2)
1514 /* Need to be root or have memorystatus entitlement */
1515 if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
1516 return EPERM;
1517 }
1518
1519 int error = 0, pid = 0;
1520 struct knote *kn = NULL;
1521 boolean_t found_knote = FALSE;
1522 int fflags = 0; /* filter flags for EVFILT_MEMORYSTATUS */
1523 uint64_t value = 0;
1524
1525 error = sysctl_handle_quad(oidp, &value, 0, req);
1526 if (error || !req->newptr) {
1527 return error;
1528 }
1529
1530 /*
1531 * Find the pid in the low 32 bits of value passed in.
1532 */
1533 pid = (int)(value & 0xFFFFFFFF);
1534
1535 /*
1536 * Find notification in the high 32 bits of the value passed in.
1537 */
1538 fflags = (int)((value >> 32) & 0xFFFFFFFF);
1539
1540 /*
1541 * For backwards compatibility, when no notification is
1542 * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN
1543 */
1544 if (fflags == 0) {
1545 fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1546 // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
1547 }
1548
1549 /* wake up everybody waiting for kVMPressureJetsam */
1550 if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
1551 memorystatus_issue_fg_band_notify();
1552 return error;
1553 }
1554
1555 /*
1556 * See event.h ... fflags for EVFILT_MEMORYSTATUS
1557 */
1558 if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
1559 (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
1560 (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
1561 (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
1562 (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
1563 (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
1564 (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
1565 ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
1566 printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags);
1567 error = 1;
1568 return error;
1569 }
1570
1571 /*
1572 * Forcibly send pid a memorystatus notification.
1573 */
1574
1575 memorystatus_klist_lock();
1576
1577 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1578 proc_t knote_proc = knote_get_kq(kn)->kq_p;
1579 pid_t knote_pid = knote_proc->p_pid;
1580
1581 if (knote_pid == pid) {
1582 /*
1583 * Forcibly send this pid a memorystatus notification.
1584 */
1585 kn->kn_fflags = fflags;
1586 found_knote = TRUE;
1587 }
1588 }
1589
1590 if (found_knote) {
1591 KNOTE(&memorystatus_klist, 0);
1592 printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid);
1593 error = 0;
1594 } else {
1595 printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
1596 error = 1;
1597 }
1598
1599 memorystatus_klist_unlock();
1600
1601 return error;
1602}
1603
/*
 * Register memorystatus_vm_pressure_send under _kern as a write-only quad
 * served by sysctl_memorystatus_vm_pressure_send. CTLFLAG_ANYBODY is set
 * here; the handler itself performs the superuser / entitlement check.
 */
1604 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
1605 0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
1606
1607#endif /* DEBUG || DEVELOPMENT */
1608
1609#endif /* VM_PRESSURE_EVENTS */