/* bsd/kern/kern_memorystatus.c (from xnu-4570.31.3) */
/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 */

#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/assert.h>
#include <kern/debug.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/host.h>
#include <kern/policy_internal.h>
#include <kern/thread_group.h>

#include <IOKit/IOBSD.h>

#include <libkern/libkern.h>
#include <mach/coalition.h>
#include <mach/mach_time.h>
#include <mach/task.h>
#include <mach/host_priv.h>
#include <mach/mach_host.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <sys/coalition.h>
#include <sys/kern_event.h>
#include <sys/proc.h>
#include <sys/proc_info.h>
#include <sys/reason.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/wait.h>
#include <sys/tree.h>
#include <sys/priv.h>
#include <vm/vm_pageout.h>
#include <vm/vm_protos.h>

#if CONFIG_FREEZE
#include <vm/vm_map.h>
#endif /* CONFIG_FREEZE */

#include <sys/kern_memorystatus.h>

#include <mach/machine/sdt.h>
#include <libkern/section_keywords.h>

/* For logging clarity */
static const char *memorystatus_kill_cause_name[] = {
	"",
	"jettisoned",		/* kMemorystatusKilled			*/
	"highwater",		/* kMemorystatusKilledHiwat		*/
	"vnode-limit",		/* kMemorystatusKilledVnodes		*/
	"vm-pageshortage",	/* kMemorystatusKilledVMPageShortage	*/
	"vm-thrashing",		/* kMemorystatusKilledVMThrashing	*/
	"fc-thrashing",		/* kMemorystatusKilledFCThrashing	*/
	"per-process-limit",	/* kMemorystatusKilledPerProcessLimit	*/
	"diagnostic",		/* kMemorystatusKilledDiagnostic	*/
	"idle-exit",		/* kMemorystatusKilledIdleExit		*/
	"zone-map-exhaustion",	/* kMemorystatusKilledZoneMapExhaustion	*/
};

static const char *
memorystatus_priority_band_name(int32_t priority)
{
	switch (priority) {
	case JETSAM_PRIORITY_FOREGROUND:
		return "FOREGROUND";
	case JETSAM_PRIORITY_AUDIO_AND_ACCESSORY:
		return "AUDIO_AND_ACCESSORY";
	case JETSAM_PRIORITY_CONDUCTOR:
		return "CONDUCTOR";
	case JETSAM_PRIORITY_HOME:
		return "HOME";
	case JETSAM_PRIORITY_EXECUTIVE:
		return "EXECUTIVE";
	case JETSAM_PRIORITY_IMPORTANT:
		return "IMPORTANT";
	case JETSAM_PRIORITY_CRITICAL:
		return "CRITICAL";
	}

	return ("?");
}

/* Does cause indicate vm or fc thrashing? */
static boolean_t
is_reason_thrashing(unsigned cause)
{
	switch (cause) {
	case kMemorystatusKilledVMThrashing:
	case kMemorystatusKilledFCThrashing:
		return TRUE;
	default:
		return FALSE;
	}
}

/* Is the zone map almost full? */
static boolean_t
is_reason_zone_map_exhaustion(unsigned cause)
{
	if (cause == kMemorystatusKilledZoneMapExhaustion)
		return TRUE;
	return FALSE;
}

/*
 * Returns the current zone map size and capacity to include in the jetsam snapshot.
 * Defined in zalloc.c
 */
extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity);

/*
 * Returns the name of the largest zone and its size to include in the jetsam snapshot.
 * Defined in zalloc.c
 */
extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size);

/*
 * These are very verbose printf()s; enable with
 * MEMORYSTATUS_DEBUG_LOG
 */
#if MEMORYSTATUS_DEBUG_LOG
#define MEMORYSTATUS_DEBUG(cond, format, ...)		\
do {							\
	if (cond) { printf(format, ##__VA_ARGS__); }	\
} while(0)
#else
#define MEMORYSTATUS_DEBUG(cond, format, ...)
#endif
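/*
 * Illustrative only (not in the original source): a call site gates a
 * verbose trace on a runtime condition, e.g.
 *
 *	MEMORYSTATUS_DEBUG(1, "memorystatus: pid %d moved to band %d\n",
 *	    p->p_pid, p->p_memstat_effectivepriority);
 *
 * With MEMORYSTATUS_DEBUG_LOG undefined, the macro expands to nothing,
 * so neither the condition nor the arguments are evaluated.
 */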

/*
 * Active / Inactive limit support
 * proc list must be locked
 *
 * The SET_*** macros are used to initialize a limit
 * for the first time.
 *
 * The CACHE_*** macros are used to cache the limit that will
 * soon be in effect down in the ledgers.
 */

#define SET_ACTIVE_LIMITS_LOCKED(p, limit, is_fatal)			\
MACRO_BEGIN								\
(p)->p_memstat_memlimit_active = (limit);				\
   if (is_fatal) {							\
	   (p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;	\
   } else {								\
	   (p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;	\
   }									\
MACRO_END

#define SET_INACTIVE_LIMITS_LOCKED(p, limit, is_fatal)			\
MACRO_BEGIN								\
(p)->p_memstat_memlimit_inactive = (limit);				\
   if (is_fatal) {							\
	   (p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;	\
   } else {								\
	   (p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;	\
   }									\
MACRO_END

#define CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal)				\
MACRO_BEGIN								\
(p)->p_memstat_memlimit = (p)->p_memstat_memlimit_active;		\
   if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {	\
	   (p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;		\
	   is_fatal = TRUE;						\
   } else {								\
	   (p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;		\
	   is_fatal = FALSE;						\
   }									\
MACRO_END

#define CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal)			\
MACRO_BEGIN								\
(p)->p_memstat_memlimit = (p)->p_memstat_memlimit_inactive;		\
   if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {	\
	   (p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;		\
	   is_fatal = TRUE;						\
   } else {								\
	   (p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;		\
	   is_fatal = FALSE;						\
   }									\
MACRO_END


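/*
 * Illustrative only (not in the original source): a typical caller first
 * stores both limit variants and then caches whichever one matches the
 * proc's current state, mirroring the pattern used by
 * sysctl_memorystatus_highwater_enable() below (local names hypothetical):
 *
 *	boolean_t is_fatal;
 *	SET_ACTIVE_LIMITS_LOCKED(p, active_limit, active_is_fatal);
 *	SET_INACTIVE_LIMITS_LOCKED(p, inactive_limit, inactive_is_fatal);
 *	if (proc_jetsam_state_is_active_locked(p)) {
 *		CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
 *	} else {
 *		CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
 *	}
 *	// push the cached limit down into the ledger
 *	task_set_phys_footprint_limit_internal(p->task,
 *	    (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
 *	    NULL, proc_jetsam_state_is_active_locked(p), is_fatal);
 */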
/* General tunables */

unsigned long delta_percentage = 5;
unsigned long critical_threshold_percentage = 5;
unsigned long idle_offset_percentage = 5;
unsigned long pressure_threshold_percentage = 15;
unsigned long freeze_threshold_percentage = 50;
unsigned long policy_more_free_offset_percentage = 5;

/* General memorystatus stuff */

struct klist memorystatus_klist;
static lck_mtx_t memorystatus_klist_mutex;

static void memorystatus_klist_lock(void);
static void memorystatus_klist_unlock(void);

static uint64_t memorystatus_sysprocs_idle_delay_time = 0;
static uint64_t memorystatus_apps_idle_delay_time = 0;

/*
 * Memorystatus kevents
 */

static int filt_memorystatusattach(struct knote *kn, struct kevent_internal_s *kev);
static void filt_memorystatusdetach(struct knote *kn);
static int filt_memorystatus(struct knote *kn, long hint);
static int filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_memorystatusprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);

SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
	.f_attach = filt_memorystatusattach,
	.f_detach = filt_memorystatusdetach,
	.f_event = filt_memorystatus,
	.f_touch = filt_memorystatustouch,
	.f_process = filt_memorystatusprocess,
};

enum {
	kMemorystatusNoPressure = 0x1,
	kMemorystatusPressure = 0x2,
	kMemorystatusLowSwap = 0x4,
	kMemorystatusProcLimitWarn = 0x8,
	kMemorystatusProcLimitCritical = 0x10
};

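/*
 * Illustrative only (not in the original source): userspace consumers
 * receive these events by registering an EVFILT_MEMORYSTATUS knote whose
 * fflags select the NOTE_MEMORYSTATUS_* conditions of interest. The
 * registration details below are a sketch, not confirmed by this file:
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent ke;
 *	EV_SET(&ke, 0, EVFILT_MEMORYSTATUS, EV_ADD,
 *	    NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL,
 *	    0, NULL);
 *	kevent(kq, &ke, 1, NULL, 0, NULL);	// later kevent() calls deliver events
 */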
/* Idle guard handling */

static int32_t memorystatus_scheduled_idle_demotions_sysprocs = 0;
static int32_t memorystatus_scheduled_idle_demotions_apps = 0;

static thread_call_t memorystatus_idle_demotion_call;

static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state);
static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state);
static void memorystatus_reschedule_idle_demotion_locked(void);

static void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check);

int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap);

vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);

boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
void memorystatus_send_low_swap_note(void);

int memorystatus_wakeup = 0;

unsigned int memorystatus_level = 0;

static int memorystatus_list_count = 0;

#define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1)

typedef struct memstat_bucket {
	TAILQ_HEAD(, proc) list;
	int count;
} memstat_bucket_t;
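/*
 * Illustrative only (not in the original source): each priority band is a
 * TAILQ of procs linked through p_memstat_list, so walking one band while
 * holding the proc list lock looks like:
 *
 *	proc_t p;
 *	TAILQ_FOREACH(p, &memstat_bucket[JETSAM_PRIORITY_IDLE].list, p_memstat_list) {
 *		// inspect p->p_memstat_effectivepriority, p->p_memstat_state, ...
 *	}
 *
 * memorystatus_get_first_proc_locked() / memorystatus_get_next_proc_locked(),
 * defined later in this file, wrap this pattern and can optionally search
 * upward across bands.
 */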

memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];

int memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index);

uint64_t memstat_idle_demotion_deadline = 0;

int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
int applications_aging_band = JETSAM_PRIORITY_IDLE;

#define isProcessInAgingBands(p)	((isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) || (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)))
#define isApp(p)			(! (p->p_memstat_dirty & P_DIRTY_TRACK))
#define isSysProc(p)			((p->p_memstat_dirty & P_DIRTY_TRACK))

#define kJetsamAgingPolicyNone				(0)
#define kJetsamAgingPolicyLegacy			(1)
#define kJetsamAgingPolicySysProcsReclaimedFirst	(2)
#define kJetsamAgingPolicyAppsReclaimedFirst		(3)
#define kJetsamAgingPolicyMax				kJetsamAgingPolicyAppsReclaimedFirst

unsigned int jetsam_aging_policy = kJetsamAgingPolicyLegacy;

extern int corpse_for_fatal_memkill;
extern unsigned long total_corpses_count(void) __attribute__((pure));
extern void task_purge_all_corpses(void);
boolean_t memorystatus_allowed_vm_map_fork(__unused task_t);

#if 0

/* Keeping around for future use if we need a utility that can do this OR an app that needs a dynamic adjustment. */

static int
sysctl_set_jetsam_aging_policy SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

	int error = 0, val = 0;
	memstat_bucket_t *old_bucket = 0;
	int old_system_procs_aging_band = 0, new_system_procs_aging_band = 0;
	int old_applications_aging_band = 0, new_applications_aging_band = 0;
	proc_t p = NULL, next_proc = NULL;


	error = sysctl_io_number(req, jetsam_aging_policy, sizeof(int), &val, NULL);
	if (error || !req->newptr) {
		return (error);
	}

	if ((val < 0) || (val > kJetsamAgingPolicyMax)) {
		printf("jetsam: ordering policy sysctl has invalid value - %d\n", val);
		return EINVAL;
	}

	/*
	 * We need to synchronize with any potential adding/removal from aging bands
	 * that might be in progress currently. We use the proc_list_lock() just for
	 * consistency with all the routines dealing with 'aging' processes. We need
	 * a lighter-weight lock.
	 */
	proc_list_lock();

	old_system_procs_aging_band = system_procs_aging_band;
	old_applications_aging_band = applications_aging_band;

	switch (val) {

		case kJetsamAgingPolicyNone:
			new_system_procs_aging_band = JETSAM_PRIORITY_IDLE;
			new_applications_aging_band = JETSAM_PRIORITY_IDLE;
			break;

		case kJetsamAgingPolicyLegacy:
			/*
			 * Legacy behavior where some daemons get a 10s protection once and only before the first clean->dirty->clean transition before going into the IDLE band.
			 */
			new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
			new_applications_aging_band = JETSAM_PRIORITY_IDLE;
			break;

		case kJetsamAgingPolicySysProcsReclaimedFirst:
			new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
			new_applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
			break;

		case kJetsamAgingPolicyAppsReclaimedFirst:
			new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2;
			new_applications_aging_band = JETSAM_PRIORITY_AGING_BAND1;
			break;

		default:
			break;
	}

	if (old_system_procs_aging_band && (old_system_procs_aging_band != new_system_procs_aging_band)) {

		old_bucket = &memstat_bucket[old_system_procs_aging_band];
		p = TAILQ_FIRST(&old_bucket->list);

		while (p) {

			next_proc = TAILQ_NEXT(p, p_memstat_list);

			if (isSysProc(p)) {
				if (new_system_procs_aging_band == JETSAM_PRIORITY_IDLE) {
					memorystatus_invalidate_idle_demotion_locked(p, TRUE);
				}

				memorystatus_update_priority_locked(p, new_system_procs_aging_band, false, true);
			}

			p = next_proc;
			continue;
		}
	}

	if (old_applications_aging_band && (old_applications_aging_band != new_applications_aging_band)) {

		old_bucket = &memstat_bucket[old_applications_aging_band];
		p = TAILQ_FIRST(&old_bucket->list);

		while (p) {

			next_proc = TAILQ_NEXT(p, p_memstat_list);

			if (isApp(p)) {
				if (new_applications_aging_band == JETSAM_PRIORITY_IDLE) {
					memorystatus_invalidate_idle_demotion_locked(p, TRUE);
				}

				memorystatus_update_priority_locked(p, new_applications_aging_band, false, true);
			}

			p = next_proc;
			continue;
		}
	}

	jetsam_aging_policy = val;
	system_procs_aging_band = new_system_procs_aging_band;
	applications_aging_band = new_applications_aging_band;

	proc_list_unlock();

	return (0);
}

SYSCTL_PROC(_kern, OID_AUTO, set_jetsam_aging_policy, CTLTYPE_INT|CTLFLAG_RW,
	0, 0, sysctl_set_jetsam_aging_policy, "I", "Jetsam Aging Policy");
#endif /*0*/

static int
sysctl_jetsam_set_sysprocs_idle_delay_time SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

	int error = 0, val = 0, old_time_in_secs = 0;
	uint64_t old_time_in_ns = 0;

	absolutetime_to_nanoseconds(memorystatus_sysprocs_idle_delay_time, &old_time_in_ns);
	old_time_in_secs = old_time_in_ns / NSEC_PER_SEC;

	error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
	if (error || !req->newptr) {
		return (error);
	}

	if ((val < 0) || (val > INT32_MAX)) {
		printf("jetsam: new idle delay interval has invalid value.\n");
		return EINVAL;
	}

	nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);

	return(0);
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_sysprocs_idle_delay_time, CTLTYPE_INT|CTLFLAG_RW,
	0, 0, sysctl_jetsam_set_sysprocs_idle_delay_time, "I", "Aging window for system processes");


static int
sysctl_jetsam_set_apps_idle_delay_time SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

	int error = 0, val = 0, old_time_in_secs = 0;
	uint64_t old_time_in_ns = 0;

	absolutetime_to_nanoseconds(memorystatus_apps_idle_delay_time, &old_time_in_ns);
	old_time_in_secs = old_time_in_ns / NSEC_PER_SEC;

	error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
	if (error || !req->newptr) {
		return (error);
	}

	if ((val < 0) || (val > INT32_MAX)) {
		printf("jetsam: new idle delay interval has invalid value.\n");
		return EINVAL;
	}

	nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);

	return(0);
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_apps_idle_delay_time, CTLTYPE_INT|CTLFLAG_RW,
	0, 0, sysctl_jetsam_set_apps_idle_delay_time, "I", "Aging window for applications");

SYSCTL_INT(_kern, OID_AUTO, jetsam_aging_policy, CTLTYPE_INT|CTLFLAG_RD, &jetsam_aging_policy, 0, "");

static unsigned int memorystatus_dirty_count = 0;

SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED, &max_task_footprint_mb, 0, "");

#if CONFIG_EMBEDDED

SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_level, 0, "");

#endif /* CONFIG_EMBEDDED */

int
memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
{
	user_addr_t level = 0;

	level = args->level;

	if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
		return EFAULT;
	}

	return 0;
}

static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search);
static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search);

static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);

/* Memory Limits */

static int memorystatus_highwater_enabled = 1;	/* Update the cached memlimit data. */

static boolean_t proc_jetsam_state_is_active_locked(proc_t);
static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);


static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry);

static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

int proc_get_memstat_priority(proc_t, boolean_t);

static boolean_t memorystatus_idle_snapshot = 0;

unsigned int memorystatus_delta = 0;

/* Jetsam Loop Detection */
static boolean_t memorystatus_jld_enabled = FALSE;		/* Enable jetsam loop detection */
static uint32_t memorystatus_jld_eval_period_msecs = 0;		/* Init pass sets this based on device memory size */
static int      memorystatus_jld_eval_aggressive_count = 3;	/* Raise the priority max after 'n' aggressive loops */
static int      memorystatus_jld_eval_aggressive_priority_band_max = 15;  /* Kill aggressively up through this band */

/*
 * A FG app can request that the aggressive jetsam mechanism display some leniency in the FG band. This 'lenient' mode is described as:
 * --- if aggressive jetsam kills an app in the FG band and gets back >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD memory, it will stop the aggressive march further into and up the jetsam bands.
 *
 * RESTRICTIONS:
 * - Such a request is respected/acknowledged only once while that 'requesting' app is in the FG band i.e. if aggressive jetsam was
 *   needed and the 'lenient' mode was deployed then that's it for this special mode while the app is in the FG band.
 *
 * - If the app is still in the FG band and aggressive jetsam is needed again, there will be no stop-and-check the next time around.
 *
 * - Also, the transition of the 'requesting' app away from the FG band will void this special behavior.
 */

#define AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD	25
boolean_t memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
boolean_t memorystatus_aggressive_jetsam_lenient = FALSE;

#if DEVELOPMENT || DEBUG
/*
 * Jetsam Loop Detection tunables.
 */

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_period_msecs, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_period_msecs, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_count, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_count, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_priority_band_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_priority_band_max, 0, "");
#endif /* DEVELOPMENT || DEBUG */

static uint32_t kill_under_pressure_cause = 0;

/*
 * default jetsam snapshot support
 */
static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries
static unsigned int memorystatus_jetsam_snapshot_count = 0;
static unsigned int memorystatus_jetsam_snapshot_max = 0;
static uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
static uint64_t memorystatus_jetsam_snapshot_timeout = 0;
#define JETSAM_SNAPSHOT_TIMEOUT_SECS 30

/*
 * snapshot support for memstats collected at boot.
 */
static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot;

static void memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count);
static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount);
static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime);

static void memorystatus_clear_errors(void);
static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages);
static void memorystatus_get_task_phys_footprint_page_counts(task_t task,
							     uint64_t *internal_pages, uint64_t *internal_compressed_pages,
							     uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
							     uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
							     uint64_t *iokit_mapped_pages, uint64_t *page_table_pages);

static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *count);

static uint32_t memorystatus_build_state(proc_t p);
//static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);

static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority, uint32_t *errors);
static boolean_t memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors);
static boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, int aggr_count, uint32_t *errors);
static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors);

static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);

/* Priority Band Sorting Routines */
static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order);
static int memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order);
static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index);
static int memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz);

/* qsort routines */
typedef int (*cmpfunc_t)(const void *a, const void *b);
extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
static int memstat_asc_cmp(const void *a, const void *b);

/* VM pressure */

extern unsigned int vm_page_free_count;
extern unsigned int vm_page_active_count;
extern unsigned int vm_page_inactive_count;
extern unsigned int vm_page_throttled_count;
extern unsigned int vm_page_purgeable_count;
extern unsigned int vm_page_wire_count;
#if CONFIG_SECLUDED_MEMORY
extern unsigned int vm_page_secluded_count;
#endif /* CONFIG_SECLUDED_MEMORY */

#if CONFIG_JETSAM
unsigned int memorystatus_available_pages = (unsigned int)-1;
unsigned int memorystatus_available_pages_pressure = 0;
unsigned int memorystatus_available_pages_critical = 0;
static unsigned int memorystatus_available_pages_critical_base = 0;
static unsigned int memorystatus_available_pages_critical_idle_offset = 0;

#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
#else
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
#endif /* DEVELOPMENT || DEBUG */

static unsigned int memorystatus_jetsam_policy = kPolicyDefault;
unsigned int memorystatus_policy_more_free_offset_pages = 0;
static void memorystatus_update_levels_locked(boolean_t critical_only);
static unsigned int memorystatus_thread_wasted_wakeup = 0;

/* Callback into vm_compressor.c to signal that thrashing has been mitigated. */
extern void vm_thrashing_jetsam_done(void);
static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);

int32_t max_kill_priority = JETSAM_PRIORITY_MAX;

#else /* CONFIG_JETSAM */

uint64_t memorystatus_available_pages = (uint64_t)-1;
uint64_t memorystatus_available_pages_pressure = (uint64_t)-1;
uint64_t memorystatus_available_pages_critical = (uint64_t)-1;

int32_t max_kill_priority = JETSAM_PRIORITY_IDLE;
#endif /* CONFIG_JETSAM */

unsigned int memorystatus_frozen_count = 0;
unsigned int memorystatus_suspended_count = 0;

#if VM_PRESSURE_EVENTS

boolean_t memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t exceeded);

vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;

/*
 * We use this flag to signal if we have any HWM offenders
 * on the system. This way we can reduce the number of wakeups
 * of the memorystatus_thread when the system is between the
 * "pressure" and "critical" thresholds.
 *
 * The (re-)setting of this variable is done without any locks
 * or synchronization simply because it is not possible (currently)
 * to keep track of HWM offenders that drop down below their memory
 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
 * by allowing the unguarded modification of this variable.
 */
boolean_t memorystatus_hwm_candidates = 0;

static int memorystatus_send_note(int event_code, void *data, size_t data_length);

#endif /* VM_PRESSURE_EVENTS */


#if DEVELOPMENT || DEBUG

lck_grp_attr_t *disconnect_page_mappings_lck_grp_attr;
lck_grp_t *disconnect_page_mappings_lck_grp;
static lck_mtx_t disconnect_page_mappings_mutex;

extern boolean_t kill_on_no_paging_space;
#endif /* DEVELOPMENT || DEBUG */


/* Freeze */

#if CONFIG_FREEZE

boolean_t memorystatus_freeze_enabled = FALSE;
int memorystatus_freeze_wakeup = 0;

lck_grp_attr_t *freezer_lck_grp_attr;
lck_grp_t *freezer_lck_grp;
static lck_mtx_t freezer_mutex;

static inline boolean_t memorystatus_can_freeze_processes(void);
static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low);

static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused);

/* Thresholds */
static unsigned int memorystatus_freeze_threshold = 0;

static unsigned int memorystatus_freeze_pages_min = 0;
static unsigned int memorystatus_freeze_pages_max = 0;

static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;

static unsigned int memorystatus_freeze_daily_mb_max = FREEZE_DAILY_MB_MAX_DEFAULT;

/* Stats */
static uint64_t memorystatus_freeze_count = 0;
static uint64_t memorystatus_freeze_pageouts = 0;

/* Throttling */
static throttle_interval_t throttle_intervals[] = {
	{      60,	8, 0, 0, { 0, 0 }, FALSE },	/* 1 hour intermediate interval, 8x burst */
	{ 24 * 60,	1, 0, 0, { 0, 0 }, FALSE },	/* 24 hour long interval, no burst */
};

static uint64_t memorystatus_freeze_throttle_count = 0;

static unsigned int memorystatus_suspended_footprint_total = 0;	/* pages */

extern uint64_t vm_swap_get_free_space(void);

static boolean_t memorystatus_freeze_update_throttle(void);

#endif /* CONFIG_FREEZE */

/* Debug */

extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);

#if DEVELOPMENT || DEBUG

static unsigned int memorystatus_debug_dump_this_bucket = 0;

static void
memorystatus_debug_dump_bucket_locked (unsigned int bucket_index)
{
	proc_t p = NULL;
	uint64_t bytes = 0;
	int ledger_limit = 0;
	unsigned int b = bucket_index;
	boolean_t traverse_all_buckets = FALSE;

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		traverse_all_buckets = TRUE;
		b = 0;
	} else {
		traverse_all_buckets = FALSE;
		b = bucket_index;
	}

	/*
	 * footprint reported in [pages / MB]
	 * limits reported as:
	 *      L-limit  proc's Ledger limit
	 *      C-limit  proc's Cached limit, should match Ledger
	 *      A-limit  proc's Active limit
	 *     IA-limit  proc's Inactive limit
	 *      F==Fatal, NF==NonFatal
	 */

	printf("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64);
	printf("bucket [pid] [pages / MB] [state] [EP / RP] dirty deadline [L-limit / C-limit / A-limit / IA-limit] name\n");
	p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
	while (p) {
		bytes = get_task_phys_footprint(p->task);
		task_get_phys_footprint_limit(p->task, &ledger_limit);
		printf("%2d [%5d] [%5lld /%3lldMB] 0x%-8x [%2d / %2d] 0x%-3x %10lld [%3d / %3d%s / %3d%s / %3d%s] %s\n",
		       b, p->p_pid,
		       (bytes / PAGE_SIZE_64),		/* task's footprint converted from bytes to pages */
		       (bytes / (1024ULL * 1024ULL)),	/* task's footprint converted from bytes to MB */
		       p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_dirty, p->p_memstat_idledeadline,
		       ledger_limit,
		       p->p_memstat_memlimit,
		       (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
		       p->p_memstat_memlimit_active,
		       (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"),
		       p->p_memstat_memlimit_inactive,
		       (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"),
		       (*p->p_name ? p->p_name : "unknown"));
		p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets);
	}
	printf("memorystatus_debug_dump ***END***\n");
}

static int
sysctl_memorystatus_debug_dump_bucket SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	int bucket_index = 0;
	int error;
	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}
	error = SYSCTL_IN(req, &bucket_index, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}
	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		/*
		 * All jetsam buckets will be dumped.
		 */
	} else {
		/*
		 * Only a single bucket will be dumped.
		 */
	}

	proc_list_lock();
	memorystatus_debug_dump_bucket_locked(bucket_index);
	proc_list_unlock();
	memorystatus_debug_dump_this_bucket = bucket_index;
	return (error);
}

/*
 * Debug aid to look at jetsam buckets and proc jetsam fields.
 * Use this sysctl to act on a particular jetsam bucket.
 * Writing the sysctl triggers the dump.
 * Usage: sysctl kern.memorystatus_debug_dump_this_bucket=<bucket_index>
 */

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_debug_dump_this_bucket, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_debug_dump_this_bucket, 0, sysctl_memorystatus_debug_dump_bucket, "I", "");


/* Debug aid to help determine the limit */

static int
sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	proc_t p;
	unsigned int b = 0;
	int error, enable = 0;
	boolean_t use_active;	/* use the active limit and active limit attributes */
	boolean_t is_fatal;

	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}

	error = SYSCTL_IN(req, &enable, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}

	if (!(enable == 0 || enable == 1)) {
		return EINVAL;
	}

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (p) {
		use_active = proc_jetsam_state_is_active_locked(p);

		if (enable) {

			if (use_active == TRUE) {
				CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
			} else {
				CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
			}

		} else {
			/*
			 * Disabling limits does not touch the stored variants.
			 * Set the cached limit fields to system_wide defaults.
			 */
			p->p_memstat_memlimit = -1;
			p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
			is_fatal = TRUE;
		}

		/*
		 * Enforce the cached limit by writing to the ledger.
		 */
		task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);

		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
	}

	memorystatus_highwater_enabled = enable;

	proc_list_unlock();

	return 0;

}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");

#if VM_PRESSURE_EVENTS

/*
 * This routine is used for targeted notifications regardless of system memory pressure
 * and regardless of whether or not the process has already been notified.
 * It bypasses and has no effect on the only-one-notification per soft-limit policy.
 *
 * "memnote" is the current user.
 */

static int
sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error = 0, pid = 0;
	struct knote *kn = NULL;
	boolean_t found_knote = FALSE;
	int fflags = 0;		/* filter flags for EVFILT_MEMORYSTATUS */
	uint64_t value = 0;

	error = sysctl_handle_quad(oidp, &value, 0, req);
	if (error || !req->newptr)
		return (error);

	/*
	 * Find the pid in the low 32 bits of the value passed in.
	 */
	pid = (int)(value & 0xFFFFFFFF);

	/*
	 * Find the notification in the high 32 bits of the value passed in.
	 */
	fflags = (int)((value >> 32) & 0xFFFFFFFF);

	/*
	 * For backwards compatibility, when no notification is
	 * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN
	 */
	if (fflags == 0) {
		fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
		// printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
	}

	/*
	 * See event.h ... fflags for EVFILT_MEMORYSTATUS
	 */
	if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
	      (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
	      (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
	      (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
	      (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
	      (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
	      (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
	        ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {

		printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags);
		error = 1;
		return (error);
	}

	/*
	 * Forcibly send pid a memorystatus notification.
	 */

	memorystatus_klist_lock();

	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		proc_t knote_proc = knote_get_kq(kn)->kq_p;
		pid_t knote_pid = knote_proc->p_pid;

		if (knote_pid == pid) {
			/*
			 * Forcibly send this pid a memorystatus notification.
			 */
			kn->kn_fflags = fflags;
			found_knote = TRUE;
		}
	}

	if (found_knote) {
		KNOTE(&memorystatus_klist, 0);
		printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid);
		error = 0;
	} else {
		printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
		error = 1;
	}

	memorystatus_klist_unlock();

	return (error);
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
	0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");

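/*
 * Illustrative only (not in the original source): the 64-bit value is
 * composed as (fflags << 32) | pid, so a test harness could force a
 * warning notification for a given pid like this:
 *
 *	#include <sys/sysctl.h>
 *
 *	uint64_t value = ((uint64_t)NOTE_MEMORYSTATUS_PRESSURE_WARN << 32) | (uint32_t)pid;
 *	sysctlbyname("kern.memorystatus_vm_pressure_send", NULL, NULL, &value, sizeof(value));
 *
 * Leaving the high 32 bits zero falls back to NOTE_MEMORYSTATUS_PRESSURE_WARN,
 * per the backwards-compatibility check above.
 */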
#endif /* VM_PRESSURE_EVENTS */

SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");

#if CONFIG_JETSAM
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_policy_more_free_offset_pages, CTLFLAG_RW, &memorystatus_policy_more_free_offset_pages, 0, "");

static unsigned int memorystatus_jetsam_panic_debug = 0;
static unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0;

/* Diagnostic code */

enum {
	kJetsamDiagnosticModeNone =              0,
	kJetsamDiagnosticModeAll  =              1,
	kJetsamDiagnosticModeStopAtFirstActive = 2,
	kJetsamDiagnosticModeCount
} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone;

static int jetsam_diagnostic_suspended_one_active_proc = 0;

static int
sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	const char *diagnosticStrings[] = {
		"jetsam: diagnostic mode: resetting critical level.",
		"jetsam: diagnostic mode: will examine all processes",
		"jetsam: diagnostic mode: will stop at first active process"
	};

	int error, val = jetsam_diagnostic_mode;
	boolean_t changed = FALSE;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);
	if ((val < 0) || (val >= kJetsamDiagnosticModeCount)) {
		printf("jetsam: diagnostic mode: invalid value - %d\n", val);
		return EINVAL;
	}

	proc_list_lock();

	if ((unsigned int) val != jetsam_diagnostic_mode) {
		jetsam_diagnostic_mode = val;

		memorystatus_jetsam_policy &= ~kPolicyDiagnoseActive;

		switch (jetsam_diagnostic_mode) {
		case kJetsamDiagnosticModeNone:
			/* Already cleared */
			break;
		case kJetsamDiagnosticModeAll:
			memorystatus_jetsam_policy |= kPolicyDiagnoseAll;
			break;
		case kJetsamDiagnosticModeStopAtFirstActive:
			memorystatus_jetsam_policy |= kPolicyDiagnoseFirst;
			break;
		default:
			/* Already validated */
			break;
		}

		memorystatus_update_levels_locked(FALSE);
		changed = TRUE;
	}

	proc_list_unlock();

	if (changed) {
		printf("%s\n", diagnosticStrings[val]);
	}

	return (0);
}

SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED|CTLFLAG_ANYBODY,
	&jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jetsam_policy_offset_pages_diagnostic, 0, "");

#if VM_PRESSURE_EVENTS

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");

#endif /* VM_PRESSURE_EVENTS */

#endif /* CONFIG_JETSAM */

#if CONFIG_FREEZE

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, "");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");

SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, "");
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");

boolean_t memorystatus_freeze_throttle_enabled = TRUE;
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");

#define VM_PAGES_FOR_ALL_PROCS	(2)
/*
 * Manual trigger of freeze and thaw for dev / debug kernels only.
 */
static int
sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error, pid = 0;
	proc_t p;

	if (memorystatus_freeze_enabled == FALSE) {
		return ENOTSUP;
	}

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	if (pid == VM_PAGES_FOR_ALL_PROCS) {
		vm_pageout_anonymous_pages();

		return 0;
	}

	lck_mtx_lock(&freezer_mutex);

	p = proc_find(pid);
	if (p != NULL) {
		uint32_t purgeable, wired, clean, dirty;
		boolean_t shared;
		uint32_t max_pages = 0;

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {

			unsigned int avail_swap_space = 0; /* in pages. */

			/*
			 * Freezer backed by the compressor and swap file(s)
			 * which will hold compressed data.
			 */
			avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;

			max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);

		} else {
			/*
			 * We only have the compressor without any swap.
			 */
			max_pages = UINT32_MAX - 1;
		}

		error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
		proc_rele(p);

		if (error)
			error = EIO;

		lck_mtx_unlock(&freezer_mutex);
		return error;
	}

	lck_mtx_unlock(&freezer_mutex);
	return EINVAL;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
	0, 0, &sysctl_memorystatus_freeze, "I", "");

static int
sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error, pid = 0;
	proc_t p;

	if (memorystatus_freeze_enabled == FALSE) {
		return ENOTSUP;
	}

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	if (pid == VM_PAGES_FOR_ALL_PROCS) {
		do_fastwake_warmup_all();
		return 0;
	} else {
		p = proc_find(pid);
		if (p != NULL) {
			error = task_thaw(p->task);
			proc_rele(p);

			if (error)
				error = EIO;
			return error;
		}
	}

	return EINVAL;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
	0, 0, &sysctl_memorystatus_available_pages_thaw, "I", "");
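/*
 * Illustrative only (not in the original source): on a DEVELOPMENT || DEBUG
 * kernel with the freezer enabled, a process can be frozen and thawed from
 * the shell via these write-only sysctls:
 *
 *	sysctl kern.memorystatus_freeze=<pid>
 *	sysctl kern.memorystatus_thaw=<pid>
 *
 * Writing VM_PAGES_FOR_ALL_PROCS (2) instead of a pid pages out anonymous
 * pages on the freeze path, or performs fastwake warmup for all processes
 * on the thaw path, per the handlers above.
 */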

#endif /* CONFIG_FREEZE */

#endif /* DEVELOPMENT || DEBUG */

extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
						  void *parameter,
						  integer_t priority,
						  thread_t *new_thread);

#if DEVELOPMENT || DEBUG

static int
sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error = 0, pid = 0;
	proc_t p;

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	lck_mtx_lock(&disconnect_page_mappings_mutex);

	if (pid == -1) {
		vm_pageout_disconnect_all_pages();
	} else {
		p = proc_find(pid);

		if (p != NULL) {
			error = task_disconnect_page_mappings(p->task);

			proc_rele(p);

			if (error)
				error = EIO;
		} else
			error = EINVAL;
	}
	lck_mtx_unlock(&disconnect_page_mappings_mutex);

	return error;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
	0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", "");

#endif /* DEVELOPMENT || DEBUG */


/*
 * Picks the sorting routine for a given jetsam priority band.
 *
 * Input:
 *	bucket_index - jetsam priority band to be sorted.
 *	sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
 *		Currently sort_order is only meaningful when handling
 *		coalitions.
 *
 * Return:
 *	0     on success
 *	non-0 on failure
 */
static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order)
{
	int coal_sort_order;

	/*
	 * Verify the jetsam priority
	 */
	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return(EINVAL);
	}

#if DEVELOPMENT || DEBUG
	if (sort_order == JETSAM_SORT_DEFAULT) {
		coal_sort_order = COALITION_SORT_DEFAULT;
	} else {
		coal_sort_order = sort_order;		/* only used for testing scenarios */
	}
#else
	/* Verify default */
	if (sort_order == JETSAM_SORT_DEFAULT) {
		coal_sort_order = COALITION_SORT_DEFAULT;
	} else {
		return(EINVAL);
	}
#endif

	proc_list_lock();

	if (memstat_bucket[bucket_index].count == 0) {
		proc_list_unlock();
		return (0);
	}

	switch (bucket_index) {
	case JETSAM_PRIORITY_FOREGROUND:
		if (memorystatus_sort_by_largest_coalition_locked(bucket_index, coal_sort_order) == 0) {
			/*
			 * Fall back to per-process sorting when zero coalitions are found.
			 */
			memorystatus_sort_by_largest_process_locked(bucket_index);
		}
		break;
	default:
		memorystatus_sort_by_largest_process_locked(bucket_index);
		break;
	}
	proc_list_unlock();

	return(0);
}

/*
 * Sort processes by size for a single jetsam bucket.
 */

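/*
 * Note (added commentary): this is effectively an in-place selection sort.
 * Each outer pass scans the rest of the band for the proc with the largest
 * footprint (in pages) and splices it in immediately after the previously
 * placed entry, leaving the band ordered largest-first.
 */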
static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
{
	proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
	proc_t next_p = NULL, prev_max_proc = NULL;
	uint32_t pages = 0, max_pages = 0;
	memstat_bucket_t *current_bucket;

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return;
	}

	current_bucket = &memstat_bucket[bucket_index];

	p = TAILQ_FIRST(&current_bucket->list);

	while (p) {
		memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
		max_pages = pages;
		max_proc = p;
		prev_max_proc = p;

		while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
			/* traversing list until we find next largest process */
			p = next_p;
			memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
			if (pages > max_pages) {
				max_pages = pages;
				max_proc = p;
			}
		}

		if (prev_max_proc != max_proc) {
			/* found a larger process, place it in the list */
			TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
			if (insert_after_proc == NULL) {
				TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
			} else {
				TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
			}
			prev_max_proc = max_proc;
		}

		insert_after_proc = max_proc;

		p = TAILQ_NEXT(max_proc, p_memstat_list);
	}
}

static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search) {
	memstat_bucket_t *current_bucket;
	proc_t next_p;

	if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
		return NULL;
	}

	current_bucket = &memstat_bucket[*bucket_index];
	next_p = TAILQ_FIRST(&current_bucket->list);
	if (!next_p && search) {
		while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
			current_bucket = &memstat_bucket[*bucket_index];
			next_p = TAILQ_FIRST(&current_bucket->list);
		}
	}

	return next_p;
}

static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search) {
	memstat_bucket_t *current_bucket;
	proc_t next_p;

	if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
		return NULL;
	}

	next_p = TAILQ_NEXT(p, p_memstat_list);
	while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
		current_bucket = &memstat_bucket[*bucket_index];
		next_p = TAILQ_FIRST(&current_bucket->list);
	}

	return next_p;
}

__private_extern__ void
memorystatus_init(void)
{
	thread_t thread = THREAD_NULL;
	kern_return_t result;
	int i;

#if CONFIG_FREEZE
	memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
	memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
#endif

#if DEVELOPMENT || DEBUG
	disconnect_page_mappings_lck_grp_attr = lck_grp_attr_alloc_init();
	disconnect_page_mappings_lck_grp = lck_grp_alloc_init("disconnect_page_mappings", disconnect_page_mappings_lck_grp_attr);

	lck_mtx_init(&disconnect_page_mappings_mutex, disconnect_page_mappings_lck_grp, NULL);

	if (kill_on_no_paging_space == TRUE) {
		max_kill_priority = JETSAM_PRIORITY_MAX;
	}
#endif

	/* Init buckets */
	for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
		TAILQ_INIT(&memstat_bucket[i].list);
		memstat_bucket[i].count = 0;
	}
	memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);

#if CONFIG_JETSAM
	nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
	nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);

	/* Apply overrides */
	PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
	if (delta_percentage == 0) {
		delta_percentage = 5;
	}
	assert(delta_percentage < 100);
	PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
	assert(critical_threshold_percentage < 100);
	PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
	assert(idle_offset_percentage < 100);
	PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
	assert(pressure_threshold_percentage < 100);
	PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
	assert(freeze_threshold_percentage < 100);

	if (!PE_parse_boot_argn("jetsam_aging_policy", &jetsam_aging_policy,
	    sizeof(jetsam_aging_policy))) {

		if (!PE_get_default("kern.jetsam_aging_policy", &jetsam_aging_policy,
		    sizeof(jetsam_aging_policy))) {

			jetsam_aging_policy = kJetsamAgingPolicyLegacy;
		}
	}

	if (jetsam_aging_policy > kJetsamAgingPolicyMax) {
		jetsam_aging_policy = kJetsamAgingPolicyLegacy;
	}

	switch (jetsam_aging_policy) {

	case kJetsamAgingPolicyNone:
		system_procs_aging_band = JETSAM_PRIORITY_IDLE;
		applications_aging_band = JETSAM_PRIORITY_IDLE;
		break;

	case kJetsamAgingPolicyLegacy:
		/*
		 * Legacy behavior: some daemons get a 10s protection once,
		 * and only before their first clean->dirty->clean transition,
		 * before going into the IDLE band.
		 */
		system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
		applications_aging_band = JETSAM_PRIORITY_IDLE;
		break;

	case kJetsamAgingPolicySysProcsReclaimedFirst:
		system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
		applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
		break;

	case kJetsamAgingPolicyAppsReclaimedFirst:
		system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2;
		applications_aging_band = JETSAM_PRIORITY_AGING_BAND1;
		break;

	default:
		break;
	}

	/*
	 * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE
	 * band and must be below it in priority. This is so that we don't have to make
	 * our 'aging' code worry about a mix of processes, some of which need to age
	 * and some others that need to stay elevated in the jetsam bands.
	 */
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band);
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band);

	/* Take snapshots for idle-exit kills by default? First check the boot-arg... */
	if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) {
		/* ...no boot-arg, so check the device tree */
		PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));
	}

	memorystatus_delta = delta_percentage * atop_64(max_mem) / 100;
	memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100;
	memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta;
	memorystatus_policy_more_free_offset_pages = (policy_more_free_offset_percentage / delta_percentage) * memorystatus_delta;

	/* Jetsam Loop Detection */
	if (max_mem <= (512 * 1024 * 1024)) {
		/* 512 MB devices */
		memorystatus_jld_eval_period_msecs = 8000;	/* 8000 msecs == 8 second window */
	} else {
		/* 1 GB and larger devices */
		memorystatus_jld_eval_period_msecs = 6000;	/* 6000 msecs == 6 second window */
	}

	memorystatus_jld_enabled = TRUE;

	/* No contention at this point */
	memorystatus_update_levels_locked(FALSE);

#endif /* CONFIG_JETSAM */

	memorystatus_jetsam_snapshot_max = maxproc;
	memorystatus_jetsam_snapshot =
		(memorystatus_jetsam_snapshot_t*)kalloc(sizeof(memorystatus_jetsam_snapshot_t) +
		sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
	if (!memorystatus_jetsam_snapshot) {
		panic("Could not allocate memorystatus_jetsam_snapshot");
	}

	nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);

	memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));

#if CONFIG_FREEZE
	memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta;
#endif

	result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
	if (result == KERN_SUCCESS) {
		thread_deallocate(thread);
	} else {
		panic("Could not create memorystatus_thread");
	}
}
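/*
 * Worked example (illustrative only, assuming a 4 KB page size): on a 1 GB
 * device, atop_64(max_mem) is 262144 pages. With delta_percentage = 5,
 * memorystatus_delta = 5 * 262144 / 100 = 13107 pages. Note that
 * (critical_threshold_percentage / delta_percentage) above is integer
 * division, so the critical base works out to a whole multiple of
 * memorystatus_delta; e.g. a 15% critical threshold yields
 * 3 * 13107 = 39321 pages.
 */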

/* Centralised for the purposes of allowing panic-on-jetsam */
extern void
vm_run_compactor(void);

/*
 * The jetsam no frills kill call
 * Return: 0 on success
 *         error code on failure (EINVAL...)
 */
static int
jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason) {
	int error = 0;
	error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason);
	return(error);
}

/*
 * Wrapper for processes exiting with memorystatus details
 */
static boolean_t
memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason) {

	int error = 0;
	__unused pid_t victim_pid = p->p_pid;

	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START,
	    victim_pid, cause, vm_page_free_count, 0, 0);

	DTRACE_MEMORYSTATUS3(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause);
#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
	if (memorystatus_jetsam_panic_debug & (1 << cause)) {
		panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause);
	}
#else
#pragma unused(cause)
#endif

	if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
		printf("memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n", p->p_pid,
		    (*p->p_name ? p->p_name : "unknown"),
		    memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority,
		    (uint64_t)memorystatus_available_pages);
	}

	int jetsam_flags = P_LTERM_JETSAM;
	switch (cause) {
		case kMemorystatusKilledHiwat:			jetsam_flags |= P_JETSAM_HIWAT; break;
		case kMemorystatusKilledVnodes:			jetsam_flags |= P_JETSAM_VNODE; break;
		case kMemorystatusKilledVMPageShortage:		jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
		case kMemorystatusKilledVMThrashing:		jetsam_flags |= P_JETSAM_VMTHRASHING; break;
		case kMemorystatusKilledFCThrashing:		jetsam_flags |= P_JETSAM_FCTHRASHING; break;
		case kMemorystatusKilledPerProcessLimit:	jetsam_flags |= P_JETSAM_PID; break;
		case kMemorystatusKilledIdleExit:		jetsam_flags |= P_JETSAM_IDLEEXIT; break;
	}
	error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);

	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END,
	    victim_pid, cause, vm_page_free_count, error, 0);

	vm_run_compactor();

	return (error == 0);
}
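/*
 * Illustrative only: a sketch of how a kill path typically builds the exit
 * reason handed to memorystatus_do_kill(). os_reason_create() and
 * JETSAM_REASON_MEMORY_HIGHWATER are assumed from kern/ and sys/reason.h;
 * the real callers appear later in this file.
 */
#if 0 /* example sketch, not compiled */
	os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
	if (jetsam_reason == OS_REASON_NULL) {
		printf("memorystatus: failed to allocate jetsam reason\n");
	}
	boolean_t killed = memorystatus_do_kill(p, kMemorystatusKilledHiwat, jetsam_reason);
#endif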

/*
 * Node manipulation
 */

static void
memorystatus_check_levels_locked(void) {
#if CONFIG_JETSAM
	/* Update levels */
	memorystatus_update_levels_locked(TRUE);
#else /* CONFIG_JETSAM */
	/*
	 * Nothing to do here currently since we update
	 * memorystatus_available_pages in vm_pressure_response.
	 */
#endif /* CONFIG_JETSAM */
}

/*
 * Pin a process to a particular jetsam band when it is in the background, i.e. not doing active work.
 * For an application: that means it is no longer in the FG band.
 * For a daemon: that means it is no longer in its 'requested' jetsam priority band.
 */

int
memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, boolean_t effective_now)
{
	int error = 0;
	boolean_t enable = FALSE;
	proc_t p = NULL;

	if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) {
		enable = TRUE;
	} else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) {
		enable = FALSE;
	} else {
		return EINVAL;
	}

	p = proc_find(pid);
	if (p != NULL) {

		if ((enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) ||
		    (!enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == 0))) {
			/*
			 * No change in state.
			 */

		} else {

			proc_list_lock();

			if (enable) {
				p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
				memorystatus_invalidate_idle_demotion_locked(p, TRUE);

				if (effective_now) {
					if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_ELEVATED_INACTIVE) {
						if (memorystatus_highwater_enabled) {
							/*
							 * Process is about to transition from
							 * inactive --> active
							 * assign active state
							 */
							boolean_t is_fatal;
							boolean_t use_active = TRUE;
							CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
							task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
						}
						memorystatus_update_priority_locked(p, JETSAM_PRIORITY_ELEVATED_INACTIVE, FALSE, FALSE);
					}
				} else {
					if (isProcessInAgingBands(p)) {
						memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
					}
				}
			} else {

				p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
				memorystatus_invalidate_idle_demotion_locked(p, TRUE);

				if (effective_now) {
					if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE) {
						memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
					}
				} else {
					if (isProcessInAgingBands(p)) {
						memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
					}
				}
			}

			proc_list_unlock();
		}
		proc_rele(p);
		error = 0;

	} else {
		error = ESRCH;
	}

	return error;
}
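/*
 * Illustrative only: a userspace sketch of toggling the elevated-inactive
 * attribute through the memorystatus_control() syscall. The prototype and
 * the interpretation of the flags argument as effective_now are assumptions
 * based on the private <sys/kern_memorystatus.h> interface; appropriate
 * privileges are required in practice.
 */
#if 0 /* example sketch, not compiled */
	int rc = memorystatus_control(MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE,
	    pid, 1 /* effective_now */, NULL, 0);
	if (rc != 0) {
		perror("memorystatus_control");
	}
#endif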

static void
memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
{
	proc_t p;
	uint64_t current_time = 0, idle_delay_time = 0;
	int demote_prio_band = 0;
	memstat_bucket_t *demotion_bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);

	current_time = mach_absolute_time();

	proc_list_lock();

	demote_prio_band = JETSAM_PRIORITY_IDLE + 1;

	for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) {

		if (demote_prio_band != system_procs_aging_band && demote_prio_band != applications_aging_band)
			continue;

		demotion_bucket = &memstat_bucket[demote_prio_band];
		p = TAILQ_FIRST(&demotion_bucket->list);

		while (p) {
			MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid);

			assert(p->p_memstat_idledeadline);

			assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);

			if (current_time >= p->p_memstat_idledeadline) {

				if ((isSysProc(p) &&
				    ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) != P_DIRTY_IDLE_EXIT_ENABLED)) || /* system proc marked dirty */
				    task_has_assertions((struct task *)(p->task))) { /* has outstanding assertions which might indicate outstanding work too */
					idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_delay_time : memorystatus_apps_idle_delay_time;

					p->p_memstat_idledeadline += idle_delay_time;
					p = TAILQ_NEXT(p, p_memstat_list);

				} else {

					proc_t next_proc = NULL;

					next_proc = TAILQ_NEXT(p, p_memstat_list);
					memorystatus_invalidate_idle_demotion_locked(p, TRUE);

					memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false, true);

					p = next_proc;
					continue;

				}
			} else {
				// No further candidates
				break;
			}
		}

	}

	memorystatus_reschedule_idle_demotion_locked();

	proc_list_unlock();

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}

static void
memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
{
	boolean_t present_in_sysprocs_aging_bucket = FALSE;
	boolean_t present_in_apps_aging_bucket = FALSE;
	uint64_t idle_delay_time = 0;

	if (jetsam_aging_policy == kJetsamAgingPolicyNone) {
		return;
	}

	if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
		/*
		 * This process isn't going to be making the trip to the lower bands.
		 */
		return;
	}

	if (isProcessInAgingBands(p)) {

		if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
			assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) != P_DIRTY_AGING_IN_PROGRESS);
		}

		if (isSysProc(p) && system_procs_aging_band) {
			present_in_sysprocs_aging_bucket = TRUE;

		} else if (isApp(p) && applications_aging_band) {
			present_in_apps_aging_bucket = TRUE;
		}
	}

	assert(!present_in_sysprocs_aging_bucket);
	assert(!present_in_apps_aging_bucket);

	MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n",
	    p->p_pid, p->p_memstat_dirty, set_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));

	if (isSysProc(p)) {
		assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED);
	}

	idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_delay_time : memorystatus_apps_idle_delay_time;

	if (set_state) {
		p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS;
		p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time;
	}

	assert(p->p_memstat_idledeadline);

	if (isSysProc(p) && present_in_sysprocs_aging_bucket == FALSE) {
		memorystatus_scheduled_idle_demotions_sysprocs++;

	} else if (isApp(p) && present_in_apps_aging_bucket == FALSE) {
		memorystatus_scheduled_idle_demotions_apps++;
	}
}

static void
memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
{
	boolean_t present_in_sysprocs_aging_bucket = FALSE;
	boolean_t present_in_apps_aging_bucket = FALSE;

	if (!system_procs_aging_band && !applications_aging_band) {
		return;
	}

	if ((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == 0) {
		return;
	}

	if (isProcessInAgingBands(p)) {

		if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
			assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == P_DIRTY_AGING_IN_PROGRESS);
		}

		if (isSysProc(p) && system_procs_aging_band) {
			assert(p->p_memstat_effectivepriority == system_procs_aging_band);
			assert(p->p_memstat_idledeadline);
			present_in_sysprocs_aging_bucket = TRUE;

		} else if (isApp(p) && applications_aging_band) {
			assert(p->p_memstat_effectivepriority == applications_aging_band);
			assert(p->p_memstat_idledeadline);
			present_in_apps_aging_bucket = TRUE;
		}
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for pid %d (clear_state %d, demotions %d).\n",
	    p->p_pid, clear_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));

	if (clear_state) {
		p->p_memstat_idledeadline = 0;
		p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS;
	}

	if (isSysProc(p) && present_in_sysprocs_aging_bucket == TRUE) {
		memorystatus_scheduled_idle_demotions_sysprocs--;
		assert(memorystatus_scheduled_idle_demotions_sysprocs >= 0);

	} else if (isApp(p) && present_in_apps_aging_bucket == TRUE) {
		memorystatus_scheduled_idle_demotions_apps--;
		assert(memorystatus_scheduled_idle_demotions_apps >= 0);
	}

	assert((memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps) >= 0);
}

static void
memorystatus_reschedule_idle_demotion_locked(void) {
	if (0 == (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)) {
		if (memstat_idle_demotion_deadline) {
			/* Transitioned 1->0, so cancel next call */
			thread_call_cancel(memorystatus_idle_demotion_call);
			memstat_idle_demotion_deadline = 0;
		}
	} else {
		memstat_bucket_t *demotion_bucket;
		proc_t p = NULL, p1 = NULL, p2 = NULL;

		if (system_procs_aging_band) {

			demotion_bucket = &memstat_bucket[system_procs_aging_band];
			p1 = TAILQ_FIRST(&demotion_bucket->list);

			p = p1;
		}

		if (applications_aging_band) {

			demotion_bucket = &memstat_bucket[applications_aging_band];
			p2 = TAILQ_FIRST(&demotion_bucket->list);

			if (p1 && p2) {
				p = (p1->p_memstat_idledeadline > p2->p_memstat_idledeadline) ? p2 : p1;
			} else {
				p = (p1 == NULL) ? p2 : p1;
			}

		}

		assert(p);

		if (p != NULL) {
			assert(p && p->p_memstat_idledeadline);
			if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline) {
				thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
				memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
			}
		}
	}
}

/*
 * List manipulation
 */

int
memorystatus_add(proc_t p, boolean_t locked)
{
	memstat_bucket_t *bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority);

	if (!locked) {
		proc_list_lock();
	}

	DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority);

	/* Processes marked internal do not have priority tracked */
	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		goto exit;
	}

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
		assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs - 1);

	} else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
		assert(bucket->count == memorystatus_scheduled_idle_demotions_apps - 1);

	} else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
		/*
		 * Entering the idle band.
		 * Record idle start time.
		 */
		p->p_memstat_idle_start = mach_absolute_time();
	}

	TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
	bucket->count++;

	memorystatus_list_count++;

	memorystatus_check_levels_locked();

exit:
	if (!locked) {
		proc_list_unlock();
	}

	return 0;
}

/*
 * Description:
 *	Moves a process from one jetsam bucket to another,
 *	which changes the LRU position of the process.
 *
 *	Monitors transitions between buckets and, if necessary,
 *	will update cached memory limits accordingly.
 *
 * skip_demotion_check:
 *	- if the 'jetsam aging policy' is NOT 'legacy':
 *		When this flag is TRUE, it means we are going
 *		to age the ripe processes out of the aging bands and into the
 *		IDLE band and apply their inactive memory limits.
 *
 *	- if the 'jetsam aging policy' is 'legacy':
 *		When this flag is TRUE, it might mean the above aging mechanism
 *		OR
 *		it might be that we have a process that has used up its 'idle deferral'
 *		stay that is given to it once per lifetime. In this case, the process
 *		won't be going through any aging codepaths. But we still need to apply
 *		the right inactive limits, so we explicitly set this to TRUE if the
 *		new priority for the process is the IDLE band.
 */
void
memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check)
{
	memstat_bucket_t *old_bucket, *new_bucket;

	assert(priority < MEMSTAT_BUCKET_COUNT);

	/* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		return;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting %s(%d) to priority %d, inserting at %s\n",
	    (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, head_insert ? "head" : "tail");

	DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority);

#if DEVELOPMENT || DEBUG
	if (priority == JETSAM_PRIORITY_IDLE &&				/* if the process is on its way into the IDLE band */
	    skip_demotion_check == FALSE &&				/* and it isn't via the path that will set the INACTIVE memlimits */
	    (p->p_memstat_dirty & P_DIRTY_TRACK) &&			/* and it has 'DIRTY' tracking enabled */
	    ((p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) || /* and we notice that the current limit isn't the right value (inactive) */
	    ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) ? (!(p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)) : (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)))) /* OR type (fatal vs non-fatal) */
		panic("memorystatus_update_priority_locked: on %s with 0x%x, prio: %d and %d\n", p->p_name, p->p_memstat_state, priority, p->p_memstat_memlimit); /* then we must catch this */
#endif /* DEVELOPMENT || DEBUG */

	old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	if (skip_demotion_check == FALSE) {

		if (isSysProc(p)) {
			/*
			 * For system processes, the memorystatus_dirty_* routines take care of adding/removing
			 * the processes from the aging bands and balancing the demotion counts.
			 * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute.
			 */

			if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE && (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
				priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;

				assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
			}
		} else if (isApp(p)) {

			/*
			 * Check to see if the application is being lowered in jetsam priority. If so, and:
			 * - it has an 'elevated inactive jetsam band' attribute, then put it in the JETSAM_PRIORITY_ELEVATED_INACTIVE band.
			 * - it is a normal application, then let it age in the aging band if that policy is in effect.
			 */

			if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE && (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
				priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
			} else {

				if (applications_aging_band) {
					if (p->p_memstat_effectivepriority == applications_aging_band) {
						assert(old_bucket->count == (memorystatus_scheduled_idle_demotions_apps + 1));
					}

					if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && (priority <= applications_aging_band)) {
						assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
						priority = applications_aging_band;
						memorystatus_schedule_idle_demotion_locked(p, TRUE);
					}
				}
			}
		}
	}

	if ((system_procs_aging_band && (priority == system_procs_aging_band)) || (applications_aging_band && (priority == applications_aging_band))) {
		assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
	}

	TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
	old_bucket->count--;

	new_bucket = &memstat_bucket[priority];
	if (head_insert)
		TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
	else
		TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	new_bucket->count++;

	if (memorystatus_highwater_enabled) {
		boolean_t is_fatal;
		boolean_t use_active;

		/*
		 * If cached limit data is updated, then the limits
		 * will be enforced by writing to the ledgers.
		 */
		boolean_t ledger_update_needed = TRUE;

		/*
		 * Here, we must update the cached memory limit if the task
		 * is transitioning between:
		 *	active <--> inactive
		 *	FG <--> BG
		 * but:
		 *	dirty <--> clean is ignored
		 *
		 * We bypass non-idle processes that have opted into dirty tracking because
		 * a move between buckets does not imply a transition between the
		 * dirty <--> clean state.
		 */

		if (p->p_memstat_dirty & P_DIRTY_TRACK) {

			if (skip_demotion_check == TRUE && priority == JETSAM_PRIORITY_IDLE) {
				CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
				use_active = FALSE;
			} else {
				ledger_update_needed = FALSE;
			}

		} else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) {
			/*
			 *	inactive --> active
			 *	BG --> FG
			 *	assign active state
			 */
			CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
			use_active = TRUE;

		} else if ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
			/*
			 *	active --> inactive
			 *	FG --> BG
			 *	assign inactive state
			 */
			CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
			use_active = FALSE;
		} else {
			/*
			 * The transition between jetsam priority buckets apparently did
			 * not affect active/inactive state.
			 * This is not unusual... especially during startup when
			 * processes are getting established in their respective bands.
			 */
			ledger_update_needed = FALSE;
		}

		/*
		 * Enforce the new limits by writing to the ledger
		 */
		if (ledger_update_needed) {
			task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);

			MEMORYSTATUS_DEBUG(3, "memorystatus_update_priority_locked: new limit on pid %d (%dMB %s) priority old --> new (%d --> %d) dirty?=0x%x %s\n",
			    p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
			    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, priority, p->p_memstat_dirty,
			    (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
		}
	}

	/*
	 * Record idle start or idle delta.
	 */
	if (p->p_memstat_effectivepriority == priority) {
		/*
		 * This process is not transitioning between
		 * jetsam priority buckets. Do nothing.
		 */
	} else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
		uint64_t now;
		/*
		 * Transitioning out of the idle priority bucket.
		 * Record idle delta.
		 */
		assert(p->p_memstat_idle_start != 0);
		now = mach_absolute_time();
		if (now > p->p_memstat_idle_start) {
			p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
		}
	} else if (priority == JETSAM_PRIORITY_IDLE) {
		/*
		 * Transitioning into the idle priority bucket.
		 * Record idle start.
		 */
		p->p_memstat_idle_start = mach_absolute_time();
	}

	p->p_memstat_effectivepriority = priority;

#if CONFIG_SECLUDED_MEMORY
	if (secluded_for_apps &&
	    task_could_use_secluded_mem(p->task)) {
		task_set_can_use_secluded_mem(
			p->task,
			(priority >= JETSAM_PRIORITY_FOREGROUND));
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	memorystatus_check_levels_locked();
}
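/*
 * Illustrative summary of the ledger-update decision above (derived from the
 * code in memorystatus_update_priority_locked, not an authoritative spec):
 *
 *	dirty-tracked proc aging into IDLE (skip_demotion_check)  -> cache INACTIVE limit, write ledger
 *	dirty-tracked proc, any other bucket move                 -> no ledger write (dirty/clean state unchanged)
 *	non-tracked proc crossing BG -> FG                        -> cache ACTIVE limit, write ledger
 *	non-tracked proc crossing FG -> BG                        -> cache INACTIVE limit, write ledger
 *	any other move                                            -> no ledger write
 */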

/*
 * Description: Update the jetsam priority and memory limit attributes for a given process.
 *
 * Parameters:
 *	p		The process whose jetsam information is being initialized.
 *	priority	The jetsam priority band.
 *	user_data	User-specific data, unused by the kernel.
 *	effective	Guards against a race if the process's update has already occurred.
 *	update_memlimit	When true we know this is the init step via the posix_spawn path.
 *
 *	memlimit_active		Value in megabytes; the monitored footprint level while the
 *				process is active. Exceeding it may result in termination
 *				based on its associated fatal flag.
 *
 *	memlimit_active_is_fatal	When a process is active and exceeds its memory footprint,
 *					this describes whether or not it should be immediately fatal.
 *
 *	memlimit_inactive	Value in megabytes; the monitored footprint level while the
 *				process is inactive. Exceeding it may result in termination
 *				based on its associated fatal flag.
 *
 *	memlimit_inactive_is_fatal	When a process is inactive and exceeds its memory footprint,
 *					this describes whether or not it should be immediately fatal.
 *
 * Returns:	0	Success
 *		non-0	Failure
 */

int
memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective, boolean_t update_memlimit,
                    int32_t memlimit_active,   boolean_t memlimit_active_is_fatal,
                    int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal)
{
	int ret;
	boolean_t head_insert = false;

	MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing (%s) pid %d: priority %d, user_data 0x%llx\n", (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, user_data);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0);

	if (priority == -1) {
		/* Use as shorthand for default priority */
		priority = JETSAM_PRIORITY_DEFAULT;
	} else if ((priority == system_procs_aging_band) || (priority == applications_aging_band)) {
		/* Both the aging bands are reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
		priority = JETSAM_PRIORITY_IDLE;
	} else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
		/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
		priority = JETSAM_PRIORITY_IDLE;
		head_insert = TRUE;
	} else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
		/* Sanity check */
		ret = EINVAL;
		goto out;
	}

	proc_list_lock();

	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
		ret = EALREADY;
		proc_list_unlock();
		MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid);
		goto out;
	}

	if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) {
		/*
		 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
		 */
		ret = EBUSY;
		proc_list_unlock();
		goto out;
	}

	p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
	p->p_memstat_userdata = user_data;
	p->p_memstat_requestedpriority = priority;

	if (update_memlimit) {
		boolean_t is_fatal;
		boolean_t use_active;

		/*
		 * Posix_spawn'd processes come through this path to instantiate ledger limits.
		 * Forked processes do not come through this path, so no ledger limits exist.
		 * (That's why forked processes can consume unlimited memory.)
		 */

		MEMORYSTATUS_DEBUG(3, "memorystatus_update(enter): pid %d, priority %d, dirty=0x%x, Active(%dMB %s), Inactive(%dMB, %s)\n",
		    p->p_pid, priority, p->p_memstat_dirty,
		    memlimit_active, (memlimit_active_is_fatal ? "F " : "NF"),
		    memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF"));

		if (memlimit_active <= 0) {
			/*
			 * This process will have a system_wide task limit when active.
			 * The system_wide task limit is always fatal.
			 * It's quite common to see the non-fatal flag passed in here.
			 * It's not an error; we just ignore it.
			 */

			/*
			 * For backward compatibility with some unexplained launchd behavior,
			 * we allow a zero-sized limit. But we still enforce the system_wide limit
			 * when written to the ledgers.
			 */

			if (memlimit_active < 0) {
				memlimit_active = -1;	/* enforces system_wide task limit */
			}
			memlimit_active_is_fatal = TRUE;
		}

		if (memlimit_inactive <= 0) {
			/*
			 * This process will have a system_wide task limit when inactive.
			 * The system_wide task limit is always fatal.
			 */

			memlimit_inactive = -1;
			memlimit_inactive_is_fatal = TRUE;
		}

		/*
		 * Initialize the active limit variants for this process.
		 */
		SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);

		/*
		 * Initialize the inactive limit variants for this process.
		 */
		SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);

		/*
		 * Initialize the cached limits for the target process.
		 * When the target process is dirty tracked, it's typically
		 * in a clean state. Non-dirty-tracked processes are
		 * typically active (Foreground or above).
		 * But just in case, we don't make assumptions...
		 */

		if (proc_jetsam_state_is_active_locked(p) == TRUE) {
			CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
			use_active = TRUE;
		} else {
			CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
			use_active = FALSE;
		}

		/*
		 * Enforce the cached limit by writing to the ledger.
		 */
		if (memorystatus_highwater_enabled) {
			/* apply now */
			task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal);

			MEMORYSTATUS_DEBUG(3, "memorystatus_update: init: limit on pid %d (%dMB %s) targeting priority(%d) dirty?=0x%x %s\n",
			    p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
			    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), priority, p->p_memstat_dirty,
			    (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
		}
	}

	/*
	 * We can't add to the aging bands buckets here.
	 * But, we could be removing it from those buckets.
	 * Check and take appropriate steps if so.
	 */

	if (isProcessInAgingBands(p)) {

		memorystatus_invalidate_idle_demotion_locked(p, TRUE);
		memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
	} else {
		if (jetsam_aging_policy == kJetsamAgingPolicyLegacy && priority == JETSAM_PRIORITY_IDLE) {
			/*
			 * Daemons with 'inactive' limits will go through the dirty tracking codepath.
			 * This path deals with apps that may have 'inactive' limits, e.g. WebContent processes.
			 * If this is the legacy aging policy, we explicitly need to apply those limits. If it
			 * is any other aging policy, then we don't need to worry because all processes
			 * will go through the aging bands and then the demotion thread will take care to
			 * move them into the IDLE band and apply the required limits.
			 */
			memorystatus_update_priority_locked(p, priority, head_insert, TRUE);
		}
	}

	memorystatus_update_priority_locked(p, priority, head_insert, FALSE);

	proc_list_unlock();
	ret = 0;

out:
	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);

	return ret;
}
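/*
 * Illustrative only: a hypothetical invocation of the init step, as the
 * posix_spawn path might perform it for a daemon with a 100 MB fatal active
 * limit and a 50 MB non-fatal inactive limit. The argument values here are
 * made up for the example.
 */
#if 0 /* example sketch, not compiled */
	ret = memorystatus_update(p, JETSAM_PRIORITY_DEFAULT, 0 /* user_data */,
	    FALSE /* effective */, TRUE /* update_memlimit */,
	    100, TRUE  /* memlimit_active, fatal */,
	    50,  FALSE /* memlimit_inactive, non-fatal */);
#endif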

int
memorystatus_remove(proc_t p, boolean_t locked)
{
	int ret;
	memstat_bucket_t *bucket;
	boolean_t reschedule = FALSE;

	MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", p->p_pid);

	if (!locked) {
		proc_list_lock();
	}

	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {

		assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs);
		reschedule = TRUE;

	} else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {

		assert(bucket->count == memorystatus_scheduled_idle_demotions_apps);
		reschedule = TRUE;
	}

	/*
	 * Record idle delta
	 */

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
		uint64_t now = mach_absolute_time();
		if (now > p->p_memstat_idle_start) {
			p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
		}
	}

	TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
	bucket->count--;

	memorystatus_list_count--;

	/* If awaiting demotion to the idle band, clean up */
	if (reschedule) {
		memorystatus_invalidate_idle_demotion_locked(p, TRUE);
		memorystatus_reschedule_idle_demotion_locked();
	}

	memorystatus_check_levels_locked();

#if CONFIG_FREEZE
	if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
		memorystatus_frozen_count--;
	}

	if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
		memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
		memorystatus_suspended_count--;
	}
#endif

	if (!locked) {
		proc_list_unlock();
	}

	if (p) {
		ret = 0;
	} else {
		ret = ESRCH;
	}

	return ret;
}

/*
 * Validate dirty tracking flags with process state.
 *
 * Return:
 *	0     on success
 *	non-0 on failure
 *
 * The proc_list_lock is held by the caller.
 */

static int
memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) {
	/* See that the process isn't marked for termination */
	if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
		return EBUSY;
	}

	/* Idle exit requires that process be tracked */
	if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
	    !(pcontrol & PROC_DIRTY_TRACK)) {
		return EINVAL;
	}

	/* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
	if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
	    !(pcontrol & PROC_DIRTY_TRACK)) {
		return EINVAL;
	}

	/* Deferral is only relevant if idle exit is specified */
	if ((pcontrol & PROC_DIRTY_DEFER) &&
	    !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
		return EINVAL;
	}

	return(0);
}

static void
memorystatus_update_idle_priority_locked(proc_t p) {
	int32_t priority;

	MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty);

	assert(isSysProc(p));

	if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {

		priority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE;
	} else {
		priority = p->p_memstat_requestedpriority;
	}

	if (priority != p->p_memstat_effectivepriority) {

		if ((jetsam_aging_policy == kJetsamAgingPolicyLegacy) &&
		    (priority == JETSAM_PRIORITY_IDLE)) {

			/*
			 * This process is on its way into the IDLE band. The system is
			 * using the 'legacy' jetsam aging policy. That means this process
			 * has already used up its idle-deferral aging time that is given
			 * once per its lifetime. So we need to set the INACTIVE limits
			 * explicitly because it won't be going through the demotion paths
			 * that take care to apply the limits appropriately.
			 */

			if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {

				/*
				 * This process has the 'elevated inactive jetsam band' attribute.
				 * So, there will be no trip to IDLE after all.
				 * Instead, we pin the process in the elevated band,
				 * where its ACTIVE limits will apply.
				 */

				priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
			}

			memorystatus_update_priority_locked(p, priority, false, true);

		} else {
			memorystatus_update_priority_locked(p, priority, false, false);
		}
	}
}

/*
 * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
 * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
 * priority idle band when clean (and killed earlier, protecting higher priority processes).
 *
 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
 * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
 * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
 * band. The deferral can be cleared early by clearing the appropriate flag.
 *
 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
 * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed. A userspace sketch of
 * this lifecycle follows the memorystatus_dirty_track() definition below.
 */

int
memorystatus_dirty_track(proc_t p, uint32_t pcontrol) {
	unsigned int old_dirty;
	boolean_t reschedule = FALSE;
	boolean_t already_deferred = FALSE;
	boolean_t defer_now = FALSE;
	int ret = 0;

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK),
	    p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0);

	proc_list_lock();

	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
		/* error */
		goto exit;
	}

	old_dirty = p->p_memstat_dirty;

	/* These bits are cumulative, as per <rdar://problem/11159924> */
	if (pcontrol & PROC_DIRTY_TRACK) {
		p->p_memstat_dirty |= P_DIRTY_TRACK;
	}

	if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
		p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	if (old_dirty & P_DIRTY_AGING_IN_PROGRESS) {
		already_deferred = TRUE;
	}

	/* This can be set and cleared exactly once. */
	if (pcontrol & PROC_DIRTY_DEFER) {

		if (!(old_dirty & P_DIRTY_DEFER)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER;
		}

		defer_now = TRUE;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for pid %d\n",
	    ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
	    defer_now ? "Y" : "N",
	    p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
	    p->p_pid);

	/* Kick off or invalidate the idle exit deferment if there's a state transition. */
	if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
		if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {

			if (defer_now && !already_deferred) {

				/*
				 * Request to defer a clean process that's idle-exit enabled
				 * and not already in the jetsam deferred band. Most likely a
				 * new launch.
				 */
				memorystatus_schedule_idle_demotion_locked(p, TRUE);
				reschedule = TRUE;

			} else if (!defer_now) {

				/*
				 * The process isn't asking for the 'aging' facility.
				 * Could be that it is:
				 */

				if (already_deferred) {
					/*
					 * already in the aging bands. Traditionally,
					 * some processes have tried to use this to
					 * opt out of the 'aging' facility.
					 */

					memorystatus_invalidate_idle_demotion_locked(p, TRUE);
				} else {
					/*
					 * agnostic to the 'aging' facility. In that case,
					 * we'll go ahead and opt it in because this is likely
					 * a new launch (clean process, dirty tracking enabled).
					 */

					memorystatus_schedule_idle_demotion_locked(p, TRUE);
				}

				reschedule = TRUE;
			}
		}
	} else {

		/*
		 * We are trying to operate on a dirty process. Dirty processes have to
		 * be removed from the deferred band. The question is: do we reset the
		 * deferred state or not?
		 *
		 * This could be a legal request like:
		 * - this process had opted into the 'aging' band
		 * - but it's now dirty and requests to opt out.
		 * In this case, we remove the process from the band and reset its
		 * state too. It'll opt back in properly when needed.
		 *
		 * OR, this request could be a user-space bug. E.g.:
		 * - this process had opted into the 'aging' band when clean
		 * - and then issues another request to again put it into the band, except
		 *   this time the process is dirty.
		 * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of
		 * the deferred band with its state intact. So our request below is a no-op.
		 * But we do it here anyways for coverage.
		 *
		 * memorystatus_update_idle_priority_locked()
		 * single-mindedly treats a dirty process as "cannot be in the aging band".
		 */

		if (!defer_now && already_deferred) {
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			reschedule = TRUE;
		} else {

			boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;

			memorystatus_invalidate_idle_demotion_locked(p, reset_state);
			reschedule = TRUE;
		}
	}

	memorystatus_update_idle_priority_locked(p);

	if (reschedule) {
		memorystatus_reschedule_idle_demotion_locked();
	}

	ret = 0;

exit:
	proc_list_unlock();

	return ret;
}
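/*
 * Illustrative only: how a daemon typically drives this interface from
 * userspace. proc_track_dirty() and proc_set_dirty() are assumed private
 * libproc wrappers; the exact entry points and signatures are an assumption,
 * not part of this file.
 */
#if 0 /* example sketch, not compiled */
	/* At launch: opt into tracking, idle exit, and the one-shot deferral. */
	proc_track_dirty(getpid(), PROC_DIRTY_TRACK | PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER);

	/* Around each unit of work: dirty while busy, clean when idle. */
	proc_set_dirty(getpid(), true);
	/* ... do work ... */
	proc_set_dirty(getpid(), false);	/* now a candidate for idle exit */
#endif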
2d21ac55 2820
39236c6e
A
2821int
2822memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) {
2823 int ret;
2824 boolean_t kill = false;
2825 boolean_t reschedule = FALSE;
2826 boolean_t was_dirty = FALSE;
2827 boolean_t now_dirty = FALSE;
6d2010ae 2828
39236c6e 2829 MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty);
fe8ab488 2830 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0);
b0d623f7 2831
39236c6e
A
2832 proc_list_lock();
2833
fe8ab488
A
2834 if ((p->p_listflag & P_LIST_EXITED) != 0) {
2835 /*
2836 * Process is on its way out.
2837 */
2838 ret = EBUSY;
2839 goto exit;
2840 }
2841
39236c6e
A
2842 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
2843 ret = EPERM;
2844 goto exit;
2845 }
2846
2847 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
2848 was_dirty = TRUE;
2849
2850 if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
2851 /* Dirty tracking not enabled */
2852 ret = EINVAL;
2853 } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
2854 /*
2855 * Process is set to be terminated and we're attempting to mark it dirty.
2856 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
2857 */
2858 ret = EBUSY;
2859 } else {
2860 int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
2861 if (pcontrol && !(p->p_memstat_dirty & flag)) {
2862 /* Mark the process as having been dirtied at some point */
2863 p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
2864 memorystatus_dirty_count++;
2865 ret = 0;
2866 } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
3e170ce0 2867 if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
39236c6e
A
2868 /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
2869 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
2870 kill = true;
2871 } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
2872 /* Kill previously terminated processes if set clean */
2873 kill = true;
2874 }
2875 p->p_memstat_dirty &= ~flag;
2876 memorystatus_dirty_count--;
2877 ret = 0;
2878 } else {
2879 /* Already set */
2880 ret = EALREADY;
316670eb 2881 }
39236c6e
A
2882 }
2883
2884 if (ret != 0) {
2885 goto exit;
2886 }
3e170ce0 2887
39236c6e
A
2888 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
2889 now_dirty = TRUE;
2890
2891 if ((was_dirty == TRUE && now_dirty == FALSE) ||
2892 (was_dirty == FALSE && now_dirty == TRUE)) {
2893
2894 /* Manage idle exit deferral, if applied */
39037602 2895 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
fe8ab488
A
2896
2897 /*
39037602
A
2898 * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
2899 * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
2900 *
2901 * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
2902 * in that band on it's way to IDLE.
fe8ab488
A
2903 */
2904
39236c6e 2905 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
fe8ab488
A
2906 /*
2907 * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE"
2908 *
39037602
A
2909 * The process will move from its aging band to its higher requested
2910 * jetsam band.
fe8ab488 2911 */
39037602
A
2912 boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
2913
2914 memorystatus_invalidate_idle_demotion_locked(p, reset_state);
39236c6e
A
2915 reschedule = TRUE;
2916 } else {
fe8ab488
A
2917
2918 /*
2919 * Process is back from "dirty" to "clean".
fe8ab488
A
2920 */
2921
39037602
A
2922 if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) {
2923 if (mach_absolute_time() >= p->p_memstat_idledeadline) {
2924 /*
2925 * The process' deadline has expired. It currently
2926 * does not reside in any of the aging buckets.
2927 *
2928 * It's on its way to the JETSAM_PRIORITY_IDLE
2929 * bucket via memorystatus_update_idle_priority_locked()
2930 * below.
2931
2932 * So all we need to do is reset all the state on the
2933 * process that's related to the aging bucket i.e.
2934 * the AGING_IN_PROGRESS flag and the timer deadline.
2935 */
fe8ab488 2936
39037602
A
2937 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2938 reschedule = TRUE;
2939 } else {
2940 /*
2941 * It still has some protection window left and so
2942 * we just re-arm the timer without modifying any
2943 * state on the process iff it still wants into that band.
2944 */
2945
2946 if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) {
2947 memorystatus_schedule_idle_demotion_locked(p, FALSE);
2948 reschedule = TRUE;
2949 }
2950 }
39236c6e 2951 } else {
39037602
A
2952
2953 memorystatus_schedule_idle_demotion_locked(p, TRUE);
39236c6e
A
2954 reschedule = TRUE;
2955 }
2956 }
2957 }
3e170ce0 2958
39236c6e 2959 memorystatus_update_idle_priority_locked(p);
3e170ce0 2960
3e170ce0 2961 if (memorystatus_highwater_enabled) {
813fb2f6
A
2962 boolean_t ledger_update_needed = TRUE;
2963 boolean_t use_active;
2964 boolean_t is_fatal;
3e170ce0
A
2965 /*
2966 * We are in this path because this process transitioned between
2967 * dirty <--> clean state. Update the cached memory limits.
2968 */
2969
2970 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
2971 /*
5ba3f43e
A
2972 * process is pinned in elevated band
2973 * or
3e170ce0
A
2974 * process is dirty
2975 */
813fb2f6
A
2976 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
2977 use_active = TRUE;
39037602 2978 ledger_update_needed = TRUE;
3e170ce0
A
2979 } else {
2980 /*
39037602
A
2981 * process is clean...but if it has opted into pressured-exit
2982 * we don't apply the INACTIVE limit till the process has aged
2983 * out and is entering the IDLE band.
2984 * See memorystatus_update_priority_locked() for that.
3e170ce0 2985 */
39037602
A
2986
2987 if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
2988 ledger_update_needed = FALSE;
2989 } else {
813fb2f6
A
2990 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
2991 use_active = FALSE;
39037602
A
2992 ledger_update_needed = TRUE;
2993 }
3e170ce0
A
2994 }
2995
2996 /*
2997 * Enforce the new limits by writing to the ledger.
2998 *
2999 * This is a hot path and holding the proc_list_lock while writing to the ledgers,
3000 * (where the task lock is taken) is bad. So, we temporarily drop the proc_list_lock.
3001 * We aren't traversing the jetsam bucket list here, so we should be safe.
3002 * See rdar://21394491.
3003 */
3004
39037602 3005 if (ledger_update_needed && proc_ref_locked(p) == p) {
3e170ce0
A
3006 int ledger_limit;
3007 if (p->p_memstat_memlimit > 0) {
3008 ledger_limit = p->p_memstat_memlimit;
3009 } else {
3010 ledger_limit = -1;
3011 }
3012 proc_list_unlock();
813fb2f6 3013 task_set_phys_footprint_limit_internal(p->task, ledger_limit, NULL, use_active, is_fatal);
3e170ce0
A
3014 proc_list_lock();
3015 proc_rele_locked(p);
3016
3017 MEMORYSTATUS_DEBUG(3, "memorystatus_dirty_set: new limit on pid %d (%dMB %s) priority(%d) dirty?=0x%x %s\n",
3018 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
3019 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
3020 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
3021 }
3022
3023 }
39236c6e
A
3024
3025 /* If the deferral state changed, reschedule the demotion timer */
3026 if (reschedule) {
3027 memorystatus_reschedule_idle_demotion_locked();
3028 }
3029 }
3e170ce0 3030
39236c6e 3031 if (kill) {
3e170ce0
A
3032 if (proc_ref_locked(p) == p) {
3033 proc_list_unlock();
3034 psignal(p, SIGKILL);
3035 proc_list_lock();
3036 proc_rele_locked(p);
3037 }
39236c6e
A
3038 }
3039
3040exit:
3041 proc_list_unlock();
3042
3043 return ret;
3044}
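/*
 * Illustrative userspace lifecycle for the dirty-tracking interface above.
 * This is a sketch, not part of this file; it assumes the private libproc
 * wrappers proc_track_dirty()/proc_set_dirty() (declared in
 * libproc_internal.h), which funnel into the memorystatus dirty calls:
 *
 *	// Opt in: track dirty state, allow idle-exit, and request the
 *	// P_DIRTY_DEFER aging window before demotion to the idle band.
 *	proc_track_dirty(getpid(), PROC_DIRTY_TRACK |
 *	    PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER);
 *
 *	proc_set_dirty(getpid(), true);		// doing work: treated as active
 *	// ... service the request ...
 *	proc_set_dirty(getpid(), false);	// idle: eligible to age out and be reaped
 */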
b0d623f7 3045
39236c6e 3046int
fe8ab488
A
3047memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) {
3048
39236c6e 3049 int ret = 0;
fe8ab488
A
3050
3051 MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty);
39236c6e 3052
fe8ab488
A
3053 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0);
3054
3055 proc_list_lock();
3056
3057 if ((p->p_listflag & P_LIST_EXITED) != 0) {
3058 /*
3059 * Process is on its way out.
3060 */
3061 ret = EBUSY;
3062 goto exit;
3063 }
3064
3065 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3066 ret = EPERM;
3067 goto exit;
3068 }
3069
3070 if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
3071 /* Dirty tracking not enabled */
3072 ret = EINVAL;
3073 goto exit;
3074 }
3075
3076 if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER)) == 0) {
3077 ret = EINVAL;
3078 goto exit;
3079 }
3080
3081 if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
3082 p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
3083 }
3084
3085 /* This can be set and cleared exactly once. */
3086 if (pcontrol & PROC_DIRTY_DEFER) {
3087
3088 if (p->p_memstat_dirty & P_DIRTY_DEFER) {
3089
3090 p->p_memstat_dirty &= ~P_DIRTY_DEFER;
3091
3092 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
3093 memorystatus_update_idle_priority_locked(p);
3094 memorystatus_reschedule_idle_demotion_locked();
3095 }
3096 }
3097
3098 ret = 0;
3099exit:
3100 proc_list_unlock();
3101
3102 return ret;
3103}
3104
3105int
3106memorystatus_dirty_get(proc_t p) {
3107 int ret = 0;
3108
3109 proc_list_lock();
3110
3111 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
39236c6e
A
3112 ret |= PROC_DIRTY_TRACKED;
3113 if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
3114 ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
3115 }
3116 if (p->p_memstat_dirty & P_DIRTY) {
3117 ret |= PROC_DIRTY_IS_DIRTY;
3118 }
fe8ab488
A
3119 if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
3120 ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
3121 }
39236c6e
A
3122 }
3123
3124 proc_list_unlock();
3125
3126 return ret;
3127}
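/*
 * For reference, the mapping behind the bitmask returned above (a summary
 * derived from the code; not in the original file):
 *
 *	P_DIRTY_TRACK              -> PROC_DIRTY_TRACKED
 *	P_DIRTY_ALLOW_IDLE_EXIT    -> PROC_DIRTY_ALLOWS_IDLE_EXIT
 *	P_DIRTY                    -> PROC_DIRTY_IS_DIRTY
 *	P_DIRTY_LAUNCH_IN_PROGRESS -> PROC_DIRTY_LAUNCH_IS_IN_PROGRESS
 *
 * The return value is a flag set, not an errno; an untracked process
 * simply yields 0.
 */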
b0d623f7 3128
39236c6e
A
3129int
3130memorystatus_on_terminate(proc_t p) {
3131 int sig;
3132
3133 proc_list_lock();
3134
3135 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3136
3137 if ((p->p_memstat_dirty & (P_DIRTY_TRACK|P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) {
3138 /* Clean; mark as terminated and issue SIGKILL */
3139 sig = SIGKILL;
3140 } else {
3141 /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
3142 sig = SIGTERM;
316670eb 3143 }
39236c6e
A
3144
3145 proc_list_unlock();
3146
3147 return sig;
316670eb 3148}
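/*
 * Summary of the signal choice above (derived from the code):
 *
 *	tracked && clean -> SIGKILL (idle-exitable, safe to kill immediately)
 *	tracked && dirty -> SIGTERM (give it a chance to clean up)
 *	untracked        -> SIGTERM
 */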
b0d623f7 3149
316670eb 3150void
39236c6e
A
3151memorystatus_on_suspend(proc_t p)
3152{
316670eb 3153#if CONFIG_FREEZE
39236c6e 3154 uint32_t pages;
fe8ab488 3155 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
39236c6e
A
3156#endif
3157 proc_list_lock();
3158#if CONFIG_FREEZE
3159 p->p_memstat_suspendedfootprint = pages;
3160 memorystatus_suspended_footprint_total += pages;
3161 memorystatus_suspended_count++;
316670eb 3162#endif
39236c6e
A
3163 p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
3164 proc_list_unlock();
3165}
6d2010ae 3166
39236c6e
A
3167void
3168memorystatus_on_resume(proc_t p)
3169{
3170#if CONFIG_FREEZE
3171 boolean_t frozen;
3172 pid_t pid;
3173#endif
6d2010ae 3174
39236c6e 3175 proc_list_lock();
6d2010ae 3176
316670eb 3177#if CONFIG_FREEZE
39236c6e
A
3178 frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
3179 if (frozen) {
3180 memorystatus_frozen_count--;
3181 p->p_memstat_state |= P_MEMSTAT_PRIOR_THAW;
3182 }
3183
3184 memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
3185 memorystatus_suspended_count--;
3186
3187 pid = p->p_pid;
316670eb 3188#endif
39236c6e
A
3189
3190 p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
3191
3192 proc_list_unlock();
3193
3194#if CONFIG_FREEZE
3195 if (frozen) {
3196 memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
3197 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
316670eb 3198 }
39236c6e 3199#endif
316670eb 3200}
6d2010ae 3201
316670eb 3202void
39236c6e 3203memorystatus_on_inactivity(proc_t p)
6d2010ae 3204{
39236c6e 3205#pragma unused(p)
316670eb
A
3206#if CONFIG_FREEZE
3207 /* Wake the freeze thread */
3208 thread_wakeup((event_t)&memorystatus_freeze_wakeup);
3209#endif
3210}
6d2010ae 3211
39037602
A
3212/*
3213 * The proc_list_lock is held by the caller.
3214*/
39236c6e
A
3215static uint32_t
3216memorystatus_build_state(proc_t p) {
3217 uint32_t snapshot_state = 0;
3218
3219 /* General */
3220 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
3221 snapshot_state |= kMemorystatusSuspended;
3222 }
3223 if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
3224 snapshot_state |= kMemorystatusFrozen;
3225 }
3226 if (p->p_memstat_state & P_MEMSTAT_PRIOR_THAW) {
3227 snapshot_state |= kMemorystatusWasThawed;
3228 }
3229
3230 /* Tracking */
3231 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
3232 snapshot_state |= kMemorystatusTracked;
3233 }
3234 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
3235 snapshot_state |= kMemorystatusSupportsIdleExit;
3236 }
3237 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3238 snapshot_state |= kMemorystatusDirty;
3239 }
3240
3241 return snapshot_state;
3242}
3243
39236c6e
A
3244static boolean_t
3245kill_idle_exit_proc(void)
316670eb 3246{
39236c6e 3247 proc_t p, victim_p = PROC_NULL;
316670eb 3248 uint64_t current_time;
39236c6e
A
3249 boolean_t killed = FALSE;
3250 unsigned int i = 0;
39037602 3251 os_reason_t jetsam_reason = OS_REASON_NULL;
316670eb 3252
39236c6e 3253 /* Pick next idle exit victim. */
316670eb 3254 current_time = mach_absolute_time();
6d2010ae 3255
39037602
A
3256 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT);
3257 if (jetsam_reason == OS_REASON_NULL) {
3258 printf("kill_idle_exit_proc: failed to allocate jetsam reason\n");
3259 }
3260
39236c6e 3261 proc_list_lock();
6d2010ae 3262
39236c6e
A
3263 p = memorystatus_get_first_proc_locked(&i, FALSE);
3264 while (p) {
3265 /* No need to look beyond the idle band */
3266 if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
3267 break;
3268 }
3269
3270 if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT|P_DIRTY_IS_DIRTY|P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
3271 if (current_time >= p->p_memstat_idledeadline) {
3272 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3273 victim_p = proc_ref_locked(p);
3274 break;
316670eb
A
3275 }
3276 }
39236c6e
A
3277
3278 p = memorystatus_get_next_proc_locked(&i, p, FALSE);
6d2010ae 3279 }
316670eb 3280
39236c6e
A
3281 proc_list_unlock();
3282
3283 if (victim_p) {
5ba3f43e 3284 printf("memorystatus: killing_idle_process pid %d [%s]\n", victim_p->p_pid, (*victim_p->p_name ? victim_p->p_name : "unknown"));
39037602 3285 killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason);
39236c6e 3286 proc_rele(victim_p);
39037602
A
3287 } else {
3288 os_reason_free(jetsam_reason);
316670eb 3289 }
b0d623f7 3290
39236c6e 3291 return killed;
2d21ac55
A
3292}
3293
39236c6e
A
3294static void
3295memorystatus_thread_wake(void) {
3296 thread_wakeup((event_t)&memorystatus_wakeup);
b0d623f7 3297}
fe8ab488
A
3298
3299extern void vm_pressure_response(void);
b0d623f7 3300
316670eb 3301static int
39236c6e
A
3302memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
3303{
3304 if (interval_ms) {
3305 assert_wait_timeout(&memorystatus_wakeup, THREAD_UNINT, interval_ms, 1000 * NSEC_PER_USEC);
3306 } else {
3307 assert_wait(&memorystatus_wakeup, THREAD_UNINT);
3308 }
316670eb 3309
39236c6e
A
3310 return thread_block(continuation);
3311}
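/*
 * Usage note (illustrative): a zero interval parks the thread until an
 * explicit memorystatus_thread_wake(), e.g.
 *
 *	memorystatus_thread_block(100, memorystatus_thread);	// wake within ~100 ms
 *	memorystatus_thread_block(0, memorystatus_thread);	// wait for a wakeup
 */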
316670eb 3312
5ba3f43e
A
3313static boolean_t
3314memorystatus_avail_pages_below_pressure(void)
3315{
3316#if CONFIG_EMBEDDED
3317/*
3318 * Instead of CONFIG_EMBEDDED for these *avail_pages* routines, we should
3319 * key off of the system having dynamic swap support. With full swap support,
3320 * the system shouldn't really need to worry about various page thresholds.
3321 */
3322 return (memorystatus_available_pages <= memorystatus_available_pages_pressure);
3323#else /* CONFIG_EMBEDDED */
3324 return FALSE;
3325#endif /* CONFIG_EMBEDDED */
3326}
3327
3328static boolean_t
3329memorystatus_avail_pages_below_critical(void)
3330{
3331#if CONFIG_EMBEDDED
3332 return (memorystatus_available_pages <= memorystatus_available_pages_critical);
3333#else /* CONFIG_EMBEDDED */
3334 return FALSE;
3335#endif /* CONFIG_EMBEDDED */
3336}
3337
3338static boolean_t
3339memorystatus_post_snapshot(int32_t priority, uint32_t cause)
3340{
3341#if CONFIG_EMBEDDED
3342#pragma unused(cause)
3343 /*
3344 * Don't generate logs for steady-state idle-exit kills,
3345 * unless it is overridden for debug or by the device
3346 * tree.
3347 */
3348
3349 return ((priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot);
3350
3351#else /* CONFIG_EMBEDDED */
3352 /*
3353 * Don't generate logs for steady-state idle-exit kills,
3354 * unless
3355 * - it is overridden for debug or by the device
3356 * tree.
3357 * OR
3358 * - the kill causes are important, i.e. not kMemorystatusKilledIdleExit.
3359 */
3360
3361 boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause));
3362 return ((priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot || snapshot_eligible_kill_cause);
3363#endif /* CONFIG_EMBEDDED */
3364}
3365
3366static boolean_t
3367memorystatus_action_needed(void)
3368{
3369#if CONFIG_EMBEDDED
3370 return (is_reason_thrashing(kill_under_pressure_cause) ||
3371 is_reason_zone_map_exhaustion(kill_under_pressure_cause) ||
3372 memorystatus_available_pages <= memorystatus_available_pages_pressure);
3373#else /* CONFIG_EMBEDDED */
3374 return (is_reason_thrashing(kill_under_pressure_cause) ||
3375 is_reason_zone_map_exhaustion(kill_under_pressure_cause));
3376#endif /* CONFIG_EMBEDDED */
3377}
3378
3379static boolean_t
3380memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, boolean_t *post_snapshot, __unused boolean_t *is_critical)
3381{
3382 boolean_t killed = memorystatus_kill_hiwat_proc(errors);
3383
3384 if (killed) {
3385 *hwm_kill = *hwm_kill + 1;
3386 *post_snapshot = TRUE;
3387 return TRUE;
3388 } else {
3389 memorystatus_hwm_candidates = FALSE;
3390 }
3391
3392#if CONFIG_JETSAM
3393 /* No highwater processes to kill. Continue or stop for now? */
3394 if (!is_reason_thrashing(kill_under_pressure_cause) &&
3395 !is_reason_zone_map_exhaustion(kill_under_pressure_cause) &&
3396 (memorystatus_available_pages > memorystatus_available_pages_critical)) {
3397 /*
3398 * We are _not_ out of pressure but we are above the critical threshold and there's:
3399 * - no compressor thrashing
3400 * - enough zone memory
3401 * - no more HWM processes left.
3402 * For now, don't kill any other processes.
3403 */
3404
3405 if (*hwm_kill == 0) {
3406 memorystatus_thread_wasted_wakeup++;
3407 }
3408
3409 *is_critical = FALSE;
3410
3411 return TRUE;
3412 }
3413#endif /* CONFIG_JETSAM */
3414
3415 return FALSE;
3416}
3417
3418static boolean_t
3419memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_idle_kills, boolean_t *corpse_list_purged, boolean_t *post_snapshot)
3420{
3421 if (memorystatus_jld_enabled == TRUE) {
3422
3423 boolean_t killed;
3424 uint32_t errors = 0;
3425
3426 /* Jetsam Loop Detection - locals */
3427 memstat_bucket_t *bucket;
3428 int jld_bucket_count = 0;
3429 struct timeval jld_now_tstamp = {0,0};
3430 uint64_t jld_now_msecs = 0;
3431 int elevated_bucket_count = 0;
3432
3433 /* Jetsam Loop Detection - statics */
3434 static uint64_t jld_timestamp_msecs = 0;
3435 static int jld_idle_kill_candidates = 0; /* Number of available processes in band 0,1 at start */
3436 static int jld_eval_aggressive_count = 0; /* Bumps the max priority in aggressive loop */
3437 static int32_t jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
3438 /*
3439 * Jetsam Loop Detection: attempt to detect
3440 * rapid daemon relaunches in the lower bands.
3441 */
3442
3443 microuptime(&jld_now_tstamp);
3444
3445 /*
3446 * Ignore usecs in this calculation.
3447 * msecs granularity is close enough.
3448 */
3449 jld_now_msecs = (jld_now_tstamp.tv_sec * 1000);
3450
3451 proc_list_lock();
3452 switch (jetsam_aging_policy) {
3453 case kJetsamAgingPolicyLegacy:
3454 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3455 jld_bucket_count = bucket->count;
3456 bucket = &memstat_bucket[JETSAM_PRIORITY_AGING_BAND1];
3457 jld_bucket_count += bucket->count;
3458 break;
3459 case kJetsamAgingPolicySysProcsReclaimedFirst:
3460 case kJetsamAgingPolicyAppsReclaimedFirst:
3461 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3462 jld_bucket_count = bucket->count;
3463 bucket = &memstat_bucket[system_procs_aging_band];
3464 jld_bucket_count += bucket->count;
3465 bucket = &memstat_bucket[applications_aging_band];
3466 jld_bucket_count += bucket->count;
3467 break;
3468 case kJetsamAgingPolicyNone:
3469 default:
3470 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3471 jld_bucket_count = bucket->count;
3472 break;
3473 }
3474
3475 bucket = &memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE];
3476 elevated_bucket_count = bucket->count;
3477
3478 proc_list_unlock();
3479
3480 /*
3481 * memorystatus_jld_eval_period_msecs is a tunable
3482 * memorystatus_jld_eval_aggressive_count is a tunable
3483 * memorystatus_jld_eval_aggressive_priority_band_max is a tunable
3484 */
3485 if ( (jld_bucket_count == 0) ||
3486 (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) {
3487
3488 /*
3489 * Refresh evaluation parameters
3490 */
3491 jld_timestamp_msecs = jld_now_msecs;
3492 jld_idle_kill_candidates = jld_bucket_count;
3493 *jld_idle_kills = 0;
3494 jld_eval_aggressive_count = 0;
3495 jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
3496 }
3497
3498 if (*jld_idle_kills > jld_idle_kill_candidates) {
3499 jld_eval_aggressive_count++;
3500
3501#if DEVELOPMENT || DEBUG
3502 printf("memorystatus: aggressive%d: beginning of window: %lld ms, : timestamp now: %lld ms\n",
3503 jld_eval_aggressive_count,
3504 jld_timestamp_msecs,
3505 jld_now_msecs);
3506 printf("memorystatus: aggressive%d: idle candidates: %d, idle kills: %d\n",
3507 jld_eval_aggressive_count,
3508 jld_idle_kill_candidates,
3509 *jld_idle_kills);
3510#endif /* DEVELOPMENT || DEBUG */
3511
3512 if ((jld_eval_aggressive_count == memorystatus_jld_eval_aggressive_count) &&
3513 (total_corpses_count() > 0) && (*corpse_list_purged == FALSE)) {
3514 /*
3515 * If we reach this aggressive cycle, corpses might be causing memory pressure.
3516 * So, in an effort to avoid jetsams in the FG band, we will attempt to purge
3517 * corpse memory prior to this final march through JETSAM_PRIORITY_UI_SUPPORT.
3518 */
3519 task_purge_all_corpses();
3520 *corpse_list_purged = TRUE;
3521 }
3522 else if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) {
3523 /*
3524 * Bump up the jetsam priority limit (e.g. the bucket index).
3525 * Enforce bucket index sanity.
3526 */
3527 if ((memorystatus_jld_eval_aggressive_priority_band_max < 0) ||
3528 (memorystatus_jld_eval_aggressive_priority_band_max >= MEMSTAT_BUCKET_COUNT)) {
3529 /*
3530 * Do nothing. Stick with the default level.
3531 */
3532 } else {
3533 jld_priority_band_max = memorystatus_jld_eval_aggressive_priority_band_max;
3534 }
3535 }
3536
3537 /* Visit elevated processes first */
3538 while (elevated_bucket_count) {
3539
3540 elevated_bucket_count--;
3541
3542 /*
3543 * memorystatus_kill_elevated_process() drops a reference,
3544 * so take another one so we can continue to use this exit reason
3545 * even after it returns.
3546 */
3547
3548 os_reason_ref(jetsam_reason);
3549 killed = memorystatus_kill_elevated_process(
3550 cause,
3551 jetsam_reason,
3552 jld_eval_aggressive_count,
3553 &errors);
3554
3555 if (killed) {
3556 *post_snapshot = TRUE;
3557 if (memorystatus_avail_pages_below_pressure()) {
3558 /*
3559 * Still under pressure.
3560 * Find another pinned process.
3561 */
3562 continue;
3563 } else {
3564 return TRUE;
3565 }
3566 } else {
3567 /*
3568 * No pinned processes left to kill.
3569 * Abandon elevated band.
3570 */
3571 break;
3572 }
3573 }
3574
3575 /*
3576 * memorystatus_kill_top_process_aggressive() allocates its own
3577 * jetsam_reason so the kMemorystatusKilledVMThrashing cause
3578 * is consistent throughout the aggressive march.
3579 */
3580 killed = memorystatus_kill_top_process_aggressive(
3581 kMemorystatusKilledVMThrashing,
3582 jld_eval_aggressive_count,
3583 jld_priority_band_max,
3584 &errors);
3585
3586 if (killed) {
3587 /* Always generate logs after aggressive kill */
3588 *post_snapshot = TRUE;
3589 *jld_idle_kills = 0;
3590 return TRUE;
3591 }
3592 }
3593
3594 return FALSE;
3595 }
3596
3597 return FALSE;
3598}
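/*
 * Worked example of the loop-detection window above, with hypothetical
 * numbers: suppose the window opens with 4 processes in the idle/aging
 * bands, so jld_idle_kill_candidates = 4. If a 5th idle-band kill lands
 * before memorystatus_jld_eval_period_msecs elapses, jld_idle_kills
 * exceeds the candidate count and jld_eval_aggressive_count is bumped:
 * daemons are relaunching faster than the bands can drain, so we escalate
 * to aggressive kills up to jld_priority_band_max.
 */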
3599
3600
39236c6e
A
3601static void
3602memorystatus_thread(void *param __unused, wait_result_t wr __unused)
3603{
3604 static boolean_t is_vm_privileged = FALSE;
3e170ce0 3605
39236c6e
A
3606 boolean_t post_snapshot = FALSE;
3607 uint32_t errors = 0;
fe8ab488 3608 uint32_t hwm_kill = 0;
3e170ce0 3609 boolean_t sort_flag = TRUE;
39037602 3610 boolean_t corpse_list_purged = FALSE;
5ba3f43e 3611 int jld_idle_kills = 0;
316670eb 3612
39236c6e
A
3613 if (is_vm_privileged == FALSE) {
3614 /*
3615 * It's the first time the thread has run, so just mark the thread as privileged and block.
3616 * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
3617 */
3618 thread_wire(host_priv_self(), current_thread(), TRUE);
3619 is_vm_privileged = TRUE;
3620
3e170ce0
A
3621 if (vm_restricted_to_single_processor == TRUE)
3622 thread_vm_bind_group_add();
5ba3f43e 3623 thread_set_thread_name(current_thread(), "VM_memorystatus");
39236c6e 3624 memorystatus_thread_block(0, memorystatus_thread);
316670eb
A
3625 }
3626
39236c6e 3627 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
3e170ce0 3628 memorystatus_available_pages, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, memorystatus_jld_eval_aggressive_count, 0);
316670eb 3629
fe8ab488
A
3630 /*
3631 * Jetsam aware version.
3632 *
3633 * The VM pressure notification thread is working its way through clients in parallel.
39236c6e 3634 *
fe8ab488
A
3635 * So, while the pressure notification thread is targeting processes in order of
3636 * increasing jetsam priority, we can hopefully reduce / stop its work by killing
3637 * any processes that have exceeded their highwater mark.
39236c6e 3638 *
fe8ab488
A
3639 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
3640 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
39236c6e 3641 */
5ba3f43e 3642 while (memorystatus_action_needed()) {
39236c6e
A
3643 boolean_t killed;
3644 int32_t priority;
fe8ab488 3645 uint32_t cause;
39037602
A
3646 uint64_t jetsam_reason_code = JETSAM_REASON_INVALID;
3647 os_reason_t jetsam_reason = OS_REASON_NULL;
fe8ab488 3648
39037602
A
3649 cause = kill_under_pressure_cause;
3650 switch (cause) {
3651 case kMemorystatusKilledFCThrashing:
3652 jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING;
3653 break;
3654 case kMemorystatusKilledVMThrashing:
3655 jetsam_reason_code = JETSAM_REASON_MEMORY_VMTHRASHING;
3656 break;
5ba3f43e
A
3657 case kMemorystatusKilledZoneMapExhaustion:
3658 jetsam_reason_code = JETSAM_REASON_ZONE_MAP_EXHAUSTION;
3659 break;
39037602
A
3660 case kMemorystatusKilledVMPageShortage:
3661 /* falls through */
3662 default:
3663 jetsam_reason_code = JETSAM_REASON_MEMORY_VMPAGESHORTAGE;
3664 cause = kMemorystatusKilledVMPageShortage;
3665 break;
fe8ab488 3666 }
39236c6e 3667
39236c6e 3668 /* Highwater */
5ba3f43e
A
3669 boolean_t is_critical = TRUE;
3670 if (memorystatus_act_on_hiwat_processes(&errors, &hwm_kill, &post_snapshot, &is_critical)) {
3671 if (is_critical == FALSE) {
3672 /*
3673 * For now, don't kill any other processes.
3674 */
3675 break;
3676 } else {
3677 goto done;
fe8ab488 3678 }
39236c6e 3679 }
39037602
A
3680
3681 jetsam_reason = os_reason_create(OS_REASON_JETSAM, jetsam_reason_code);
3682 if (jetsam_reason == OS_REASON_NULL) {
3683 printf("memorystatus_thread: failed to allocate jetsam reason\n");
3684 }
3685
5ba3f43e
A
3686 if (memorystatus_act_aggressive(cause, jetsam_reason, &jld_idle_kills, &corpse_list_purged, &post_snapshot)) {
3687 goto done;
3e170ce0 3688 }
39037602
A
3689
3690 /*
3691 * memorystatus_kill_top_process() drops a reference,
3692 * so take another one so we can continue to use this exit reason
3693 * even after it returns.
3694 */
3695 os_reason_ref(jetsam_reason);
3696
39236c6e 3697 /* LRU */
39037602 3698 killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, jetsam_reason, &priority, &errors);
3e170ce0
A
3699 sort_flag = FALSE;
3700
39236c6e 3701 if (killed) {
5ba3f43e
A
3702 if (memorystatus_post_snapshot(priority, cause) == TRUE) {
3703
39236c6e
A
3704 post_snapshot = TRUE;
3705 }
3e170ce0
A
3706
3707 /* Jetsam Loop Detection */
3708 if (memorystatus_jld_enabled == TRUE) {
39037602 3709 if ((priority == JETSAM_PRIORITY_IDLE) || (priority == system_procs_aging_band) || (priority == applications_aging_band)) {
3e170ce0
A
3710 jld_idle_kills++;
3711 } else {
3712 /*
3713 * We've reached into bands beyond idle deferred.
3714 * We make no attempt to monitor them.
3715 */
3716 }
3717 }
39037602 3718
5ba3f43e 3719 if ((priority >= JETSAM_PRIORITY_UI_SUPPORT) && (total_corpses_count() > 0) && (corpse_list_purged == FALSE)) {
39037602
A
3720 /*
3721 * If we have jetsammed a process in or above JETSAM_PRIORITY_UI_SUPPORT
3722 * then we attempt to relieve pressure by purging corpse memory.
3723 */
3724 task_purge_all_corpses();
3725 corpse_list_purged = TRUE;
3726 }
39236c6e
A
3727 goto done;
3728 }
fe8ab488 3729
5ba3f43e 3730 if (memorystatus_avail_pages_below_critical()) {
39037602
A
3731 /*
3732 * Still under pressure and unable to kill a process - purge corpse memory
3733 */
5ba3f43e 3734 if (total_corpses_count() > 0) {
39037602
A
3735 task_purge_all_corpses();
3736 corpse_list_purged = TRUE;
3737 }
3738
5ba3f43e 3739 if (memorystatus_avail_pages_below_critical()) {
39037602
A
3740 /*
3741 * Still under pressure and unable to kill a process - panic
3742 */
5ba3f43e 3743 panic("memorystatus_jetsam_thread: no victim! available pages:%llu\n", (uint64_t)memorystatus_available_pages);
39037602 3744 }
fe8ab488 3745 }
39236c6e
A
3746
3747done:
fe8ab488
A
3748
3749 /*
3750 * We do not want to over-kill when thrashing has been detected.
3751 * To avoid that, we reset the flag here and notify the
3752 * compressor.
39236c6e 3753 */
5ba3f43e 3754 if (is_reason_thrashing(kill_under_pressure_cause)) {
fe8ab488 3755 kill_under_pressure_cause = 0;
5ba3f43e 3756#if CONFIG_JETSAM
fe8ab488 3757 vm_thrashing_jetsam_done();
5ba3f43e
A
3758#endif /* CONFIG_JETSAM */
3759 } else if (is_reason_zone_map_exhaustion(kill_under_pressure_cause)) {
3760 kill_under_pressure_cause = 0;
39236c6e 3761 }
39037602
A
3762
3763 os_reason_free(jetsam_reason);
39236c6e 3764 }
fe8ab488
A
3765
3766 kill_under_pressure_cause = 0;
3767
39236c6e
A
3768 if (errors) {
3769 memorystatus_clear_errors();
3770 }
3771
39236c6e 3772 if (post_snapshot) {
39037602 3773 proc_list_lock();
39236c6e
A
3774 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
3775 sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
3e170ce0
A
3776 uint64_t timestamp_now = mach_absolute_time();
3777 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
39037602 3778 memorystatus_jetsam_snapshot->js_gencount++;
d190cdc3
A
3779 if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
3780 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
39037602 3781 proc_list_unlock();
3e170ce0
A
3782 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
3783 if (!ret) {
3784 proc_list_lock();
3785 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
3786 proc_list_unlock();
3787 }
39037602
A
3788 } else {
3789 proc_list_unlock();
3e170ce0 3790 }
39236c6e 3791 }
3e170ce0 3792
39236c6e
A
3793 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
3794 memorystatus_available_pages, 0, 0, 0, 0);
3795
39236c6e
A
3796 memorystatus_thread_block(0, memorystatus_thread);
3797}
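/*
 * Escalation order implemented by the loop above, in brief:
 *	1. kill highwater-mark offenders (memorystatus_act_on_hiwat_processes)
 *	2. jetsam loop detection / aggressive kills (memorystatus_act_aggressive)
 *	3. LRU kill of the lowest-priority process (memorystatus_kill_top_process)
 *	4. still below critical: purge corpses, and failing that, panic.
 */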
3798
fe8ab488
A
3799/*
3800 * Returns TRUE:
3801 * when an idle-exitable proc was killed
3802 * Returns FALSE:
3803 * when there are no more idle-exitable procs found
3804 * when the attempt to kill an idle-exitable proc failed
3805 */
39236c6e 3806boolean_t memorystatus_idle_exit_from_VM(void) {
5ba3f43e
A
3807
3808 /*
3809 * This routine should no longer be needed since we are
3810 * now using jetsam bands on all platforms and so will deal
3811 * with IDLE processes within the memorystatus thread itself.
3812 *
3813 * But we still use it because we observed that macOS systems
3814 * started heavy compression/swapping with a bunch of
3815 * idle-exitable processes alive and doing nothing. We decided
3816 * we would rather kill those processes than start swapping earlier.
3817 */
3818
fe8ab488 3819 return(kill_idle_exit_proc());
39236c6e 3820}
39236c6e 3821
39037602
A
3822/*
3823 * Callback invoked when allowable physical memory footprint exceeded
3824 * (dirty pages + IOKit mappings)
3825 *
3826 * This is invoked for both advisory, non-fatal per-task high watermarks,
3827 * as well as the fatal task memory limits.
3828 */
3829void
813fb2f6 3830memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
39037602
A
3831{
3832 os_reason_t jetsam_reason = OS_REASON_NULL;
3833
3834 proc_t p = current_proc();
3835
39236c6e
A
3836#if VM_PRESSURE_EVENTS
3837 if (warning == TRUE) {
39037602
A
3838 /*
3839 * This is a warning path which implies that the current process is close, but has
3840 * not yet exceeded its per-process memory limit.
3841 */
813fb2f6 3842 if (memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) {
39236c6e 3843 /* Print warning, since it's possible that task has not registered for pressure notifications */
5ba3f43e 3844 os_log(OS_LOG_DEFAULT, "memorystatus_on_ledger_footprint_exceeded: failed to warn the current task (%d exiting, or no handler registered?).\n", p->p_pid);
39236c6e
A
3845 }
3846 return;
3847 }
3848#endif /* VM_PRESSURE_EVENTS */
3849
813fb2f6 3850 if (memlimit_is_fatal) {
39236c6e 3851 /*
fe8ab488
A
3852 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
3853 * has violated either the system-wide per-task memory limit OR its own task limit.
39236c6e 3854 */
39037602
A
3855 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
3856 if (jetsam_reason == NULL) {
3857 printf("task_exceeded footprint: failed to allocate jetsam reason\n");
3858 } else if (corpse_for_fatal_memkill != 0) {
3859 /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
3860 jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
3861 }
3862
3863 if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) {
39236c6e
A
3864 printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
3865 }
fe8ab488
A
3866 } else {
3867 /*
3868 * HWM offender exists. Done without locks or synchronization.
3869 * See comment near its declaration for more details.
3870 */
3871 memorystatus_hwm_candidates = TRUE;
39037602
A
3872
3873#if VM_PRESSURE_EVENTS
3874 /*
3875 * The current process is not in the warning path.
3876 * This path implies the current process has exceeded a non-fatal (soft) memory limit.
3877 * Failure to send note is ignored here.
3878 */
813fb2f6 3879 (void)memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */);
39037602
A
3880
3881#endif /* VM_PRESSURE_EVENTS */
3882 }
3883}
3884
813fb2f6
A
3885void
3886memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
3887{
3888 proc_t p = current_proc();
3889
3890 /*
3891 * The limit violation is logged here, but only once per process per limit.
3892 * Soft memory limit is a non-fatal high-water-mark.
3893 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
3894 */
3895
5ba3f43e
A
3896 os_log_with_startup_serial(OS_LOG_DEFAULT, "EXC_RESOURCE -> %s[%d] exceeded mem limit: %s%s %d MB (%s)\n",
3897 (*p->p_name ? p->p_name : "unknown"), p->p_pid, (memlimit_is_active ? "Active" : "Inactive"),
3898 (memlimit_is_fatal ? "Hard" : "Soft"), max_footprint_mb,
3899 (memlimit_is_fatal ? "fatal" : "non-fatal"));
813fb2f6
A
3900
3901 return;
3902}
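/*
 * For example, a fatal violation of a 2048 MB active limit by a
 * hypothetical process MyApp (pid 123) would render roughly as:
 *
 *	EXC_RESOURCE -> MyApp[123] exceeded mem limit: ActiveHard 2048 MB (fatal)
 *
 * Note the "%s%s" pair renders the two qualifiers without a separating space.
 */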
3903
3904
39037602
A
3905/*
3906 * Description:
5ba3f43e
A
3907 * Evaluates process state to determine which limit
3908 * should be applied (active vs. inactive limit).
3909 *
3910 * Processes that have the 'elevated inactive jetsam band' attribute
3911 * are first evaluated based on their current priority band.
3912 * presently elevated ==> active
3913 *
39037602
A
3914 * Processes that opt into dirty tracking are evaluated
3915 * based on clean vs dirty state.
3916 * dirty ==> active
3917 * clean ==> inactive
3918 *
3919 * Process that do not opt into dirty tracking are
3920 * evalulated based on priority level.
3921 * Foreground or above ==> active
3922 * Below Foreground ==> inactive
3923 *
3924 * Return: TRUE if active
3925 * False if inactive
3926 */
3927
3928static boolean_t
3929proc_jetsam_state_is_active_locked(proc_t p) {
3930
5ba3f43e
A
3931 if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) &&
3932 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE)) {
3933 /*
3934 * process has the 'elevated inactive jetsam band' attribute
3935 * and process is present in the elevated band
3936 * implies active state
3937 */
3938 return TRUE;
3939 } else if (p->p_memstat_dirty & P_DIRTY_TRACK) {
39037602
A
3940 /*
3941 * process has opted into dirty tracking
3942 * active state is based on dirty vs. clean
3943 */
3944 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3945 /*
3946 * process is dirty
3947 * implies active state
3948 */
3949 return TRUE;
3950 } else {
3951 /*
3952 * process is clean
3953 * implies inactive state
3954 */
3955 return FALSE;
3956 }
3957 } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
3958 /*
3959 * process is Foreground or higher
3960 * implies active state
3961 */
3962 return TRUE;
3963 } else {
3964 /*
3965 * process found below Foreground
3966 * implies inactive state
3967 */
3968 return FALSE;
3969 }
3970}
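/*
 * Note the precedence above: dirty tracking wins over priority. For
 * example, a clean daemon that opted into tracking is considered
 * inactive even while it sits at JETSAM_PRIORITY_FOREGROUND or above.
 */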
3971
3972static boolean_t
3973memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) {
3974 boolean_t res;
3975
39037602
A
3976 uint32_t errors = 0;
3977
3978 if (victim_pid == -1) {
3979 /* No pid, so kill first process */
3980 res = memorystatus_kill_top_process(TRUE, TRUE, cause, jetsam_reason, NULL, &errors);
3981 } else {
3982 res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason);
3983 }
3984
3985 if (errors) {
3986 memorystatus_clear_errors();
3987 }
3988
3989 if (res == TRUE) {
3990 /* Fire off snapshot notification */
3991 proc_list_lock();
3992 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
3993 sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
3994 uint64_t timestamp_now = mach_absolute_time();
3995 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
d190cdc3
A
3996 if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
3997 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
39037602
A
3998 proc_list_unlock();
3999 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
4000 if (!ret) {
4001 proc_list_lock();
4002 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
4003 proc_list_unlock();
4004 }
4005 } else {
4006 proc_list_unlock();
4007 }
4008 }
39037602 4009
39037602
A
4010 return res;
4011}
4012
4013/*
4014 * Jetsam a specific process.
4015 */
4016static boolean_t
4017memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) {
4018 boolean_t killed;
4019 proc_t p;
4020 uint64_t killtime = 0;
4021 clock_sec_t tv_sec;
4022 clock_usec_t tv_usec;
4023 uint32_t tv_msec;
4024
4025 /* TODO - add a victim queue and push this into the main jetsam thread */
4026
4027 p = proc_find(victim_pid);
4028 if (!p) {
4029 os_reason_free(jetsam_reason);
4030 return FALSE;
4031 }
4032
4033 proc_list_lock();
4034
39037602
A
4035 if (memorystatus_jetsam_snapshot_count == 0) {
4036 memorystatus_init_jetsam_snapshot_locked(NULL,0);
fe8ab488 4037 }
39037602
A
4038
4039 killtime = mach_absolute_time();
4040 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
4041 tv_msec = tv_usec / 1000;
4042
4043 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
4044
4045 proc_list_unlock();
4046
5ba3f43e
A
4047 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
4048 (unsigned long)tv_sec, tv_msec, victim_pid, (*p->p_name ? p->p_name : "unknown"),
4049 memorystatus_kill_cause_name[cause], p->p_memstat_effectivepriority, (uint64_t)memorystatus_available_pages);
39037602
A
4050
4051 killed = memorystatus_do_kill(p, cause, jetsam_reason);
4052 proc_rele(p);
4053
4054 return killed;
fe8ab488
A
4055}
4056
39037602 4057
3e170ce0
A
4058/*
4059 * Toggle the P_MEMSTAT_TERMINATED state.
4060 * Takes the proc_list_lock.
4061 */
4062void
4063proc_memstat_terminated(proc_t p, boolean_t set)
4064{
4065#if DEVELOPMENT || DEBUG
4066 if (p) {
4067 proc_list_lock();
4068 if (set == TRUE) {
4069 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
4070 } else {
4071 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
4072 }
4073 proc_list_unlock();
4074 }
4075#else
4076#pragma unused(p, set)
4077 /*
4078 * do nothing
4079 */
4080#endif /* DEVELOPMENT || DEBUG */
4081 return;
4082}
4083
39037602
A
4084
4085#if CONFIG_JETSAM
fe8ab488
A
4086/*
4087 * This is invoked when cpulimits have been exceeded while in fatal mode.
4088 * The jetsam_flags do not apply as those are for memory related kills.
4089 * We call this routine so that the offending process is killed with
4090 * a non-zero exit status.
4091 */
4092void
4093jetsam_on_ledger_cpulimit_exceeded(void)
4094{
4095 int retval = 0;
4096 int jetsam_flags = 0; /* make it obvious */
4097 proc_t p = current_proc();
39037602 4098 os_reason_t jetsam_reason = OS_REASON_NULL;
fe8ab488
A
4099
4100 printf("task_exceeded_cpulimit: killing pid %d [%s]\n",
39037602 4101 p->p_pid, (*p->p_name ? p->p_name : "(unknown)"));
fe8ab488 4102
39037602
A
4103 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT);
4104 if (jetsam_reason == OS_REASON_NULL) {
4105 printf("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n");
4106 }
4107
4108 retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
fe8ab488
A
4109
4110 if (retval) {
4111 printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
39236c6e
A
4112 }
4113}
4114
5ba3f43e
A
4115#endif /* CONFIG_JETSAM */
4116
39037602
A
4117static void
4118memorystatus_get_task_memory_region_count(task_t task, uint64_t *count)
4119{
4120 assert(task);
4121 assert(count);
4122
4123 *count = get_task_memory_region_count(task);
4124}
4125
5c9f4661
A
4126#if DEVELOPMENT || DEBUG
4127
4128/*
4129 * Sysctl only used to test memorystatus_allowed_vm_map_fork() path.
4130 * set a new pidwatch value
4131 * or
4132 * get the current pidwatch value
4133 */
4134
4135uint64_t memorystatus_vm_map_fork_pidwatch_val = 0;
4136#define MEMORYSTATUS_VM_MAP_FORK_ALLOWED 0x100000000
4137#define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000
4138
4139static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS {
4140#pragma unused(oidp, arg1, arg2)
4141
4142 uint64_t new_value = 0;
4143 uint64_t old_value = 0;
4144 int error = 0;
4145
4146 /*
4147 * The pid is held in the low 32 bits.
4148 * The 'allowed' flags are in the upper 32 bits.
4149 */
4150 old_value = memorystatus_vm_map_fork_pidwatch_val;
4151
4152 error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL);
4153
4154 if (error || !req->newptr) {
4155 /*
4156 * No new value passed in.
4157 */
4158 return(error);
4159 }
4160
4161 /*
4162 * A new pid was passed in via req->newptr.
4163 * Ignore any attempt to set the higher order bits.
4164 */
4165 memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF;
4166 printf("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx \n", old_value, new_value);
4167
4168 return(error);
4169}
4170
4171 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED,
4172 0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork");
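/*
 * Illustrative shell usage on a DEVELOPMENT/DEBUG kernel (pid 1234 is
 * hypothetical):
 *
 *	$ sudo sysctl -w kern.memorystatus_vm_map_fork_pidwatch=1234
 *	$ sysctl kern.memorystatus_vm_map_fork_pidwatch
 *	kern.memorystatus_vm_map_fork_pidwatch: 4294968530
 *
 * 4294968530 == 0x1000004d2, i.e. pid 1234 (0x4d2) with
 * MEMORYSTATUS_VM_MAP_FORK_ALLOWED (0x100000000) set by the macros below.
 */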
4173
4174
4175#define SET_VM_MAP_FORK_PIDWATCH_ALLOWED(task) \
4176MACRO_BEGIN \
4177if (memorystatus_vm_map_fork_pidwatch_val != 0) { \
4178 proc_t p = get_bsdtask_info(task); \
4179 if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid)) { \
4180 memorystatus_vm_map_fork_pidwatch_val |= MEMORYSTATUS_VM_MAP_FORK_ALLOWED; \
4181 } \
4182} \
4183MACRO_END
4184
4185#define SET_VM_MAP_FORK_PIDWATCH_NOT_ALLOWED(task) \
4186MACRO_BEGIN \
4187if (memorystatus_vm_map_fork_pidwatch_val != 0) { \
4188 proc_t p = get_bsdtask_info(task); \
4189 if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid)) { \
4190 memorystatus_vm_map_fork_pidwatch_val |= MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED; \
4191 } \
4192} \
4193MACRO_END
4194
4195#else /* DEVELOPMENT || DEBUG */
4196
4197#define SET_VM_MAP_FORK_PIDWATCH_ALLOWED(task)
4198#define SET_VM_MAP_FORK_PIDWATCH_NOT_ALLOWED(task)
4199
4200#endif /* DEVELOPMENT || DEBUG */
4201
5ba3f43e
A
4202/*
4203 * Called during EXC_RESOURCE handling when a process exceeds a soft
4204 * memory limit. This is the corpse fork path and here we decide if
4205 * vm_map_fork will be allowed when creating the corpse.
5c9f4661 4206 * The task being considered is suspended.
5ba3f43e
A
4207 *
4208 * By default, a vm_map_fork is allowed to proceed.
4209 *
4210 * A few simple policy assumptions:
4211 * The desktop platform is not considered in this path;
4212 * there, the vm_map_fork is always allowed.
4213 *
4214 * If the device has a zero system-wide task limit,
4215 * then the vm_map_fork is allowed.
4216 *
4217 * And if a process's memory footprint calculates to less
4218 * than or equal to half of the system-wide task limit,
4219 * then the vm_map_fork is allowed. This calculation
4220 * is based on the assumption that a process can
4221 * munch memory up to the system-wide task limit.
4222 */
4223boolean_t
4224memorystatus_allowed_vm_map_fork(__unused task_t task)
4225{
4226 boolean_t is_allowed = TRUE; /* default */
4227
4228#if CONFIG_EMBEDDED
4229
4230 uint64_t footprint_in_bytes = 0;
4231 uint64_t purgeable_in_bytes = 0;
4232 uint64_t max_allowed_bytes = 0;
4233
4234 if (max_task_footprint_mb == 0) {
5c9f4661 4235 SET_VM_MAP_FORK_PIDWATCH_ALLOWED(task);
5ba3f43e
A
4236 return (is_allowed);
4237 }
4238
4239 purgeable_in_bytes = get_task_purgeable_size(task);
4240 footprint_in_bytes = get_task_phys_footprint(task);
4241
4242 /*
4243 * Maximum is half the system-wide task limit.
4244 */
4245 max_allowed_bytes = ((((uint64_t)max_task_footprint_mb) * 1024ULL * 1024ULL) >> 1);
4246
4247 if (footprint_in_bytes > purgeable_in_bytes) {
4248 footprint_in_bytes -= purgeable_in_bytes;
4249 }
4250
4251 if (footprint_in_bytes <= max_allowed_bytes) {
5c9f4661 4252 SET_VM_MAP_FORK_PIDWATCH_ALLOWED(task);
5ba3f43e
A
4253 return (is_allowed);
4254 } else {
4255 printf("memorystatus disallowed vm_map_fork %lld %lld\n", footprint_in_bytes, max_allowed_bytes);
5c9f4661 4256 SET_VM_MAP_FORK_PIDWATCH_NOT_ALLOWED(task);
5ba3f43e
A
4257 return (!is_allowed);
4258 }
4259
4260#else /* CONFIG_EMBEDDED */
4261
5c9f4661 4262 SET_VM_MAP_FORK_PIDWATCH_ALLOWED(task);
5ba3f43e
A
4263 return (is_allowed);
4264
4265#endif /* CONFIG_EMBEDDED */
4266
4267}
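/*
 * Worked example with hypothetical numbers: with max_task_footprint_mb
 * = 2048, max_allowed_bytes = (2048 MB) >> 1 = 1024 MB. A suspended task
 * with a 1536 MB footprint, 640 MB of which is purgeable, charges
 * 1536 - 640 = 896 MB <= 1024 MB, so the vm_map_fork for the corpse is
 * allowed.
 */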
4268
39236c6e 4269static void
fe8ab488 4270memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
39236c6e
A
4271{
4272 assert(task);
4273 assert(footprint);
39037602
A
4274
4275 uint64_t pages;
4276
4277 pages = (get_task_phys_footprint(task) / PAGE_SIZE_64);
4278 assert(((uint32_t)pages) == pages);
4279 *footprint = (uint32_t)pages;
4280
39236c6e 4281 if (max_footprint) {
5ba3f43e 4282 pages = (get_task_phys_footprint_recent_max(task) / PAGE_SIZE_64);
39037602
A
4283 assert(((uint32_t)pages) == pages);
4284 *max_footprint = (uint32_t)pages;
39236c6e 4285 }
fe8ab488 4286 if (max_footprint_lifetime) {
39037602
A
4287 pages = (get_task_resident_max(task) / PAGE_SIZE_64);
4288 assert(((uint32_t)pages) == pages);
4289 *max_footprint_lifetime = (uint32_t)pages;
fe8ab488
A
4290 }
4291 if (purgeable_pages) {
39037602
A
4292 pages = (get_task_purgeable_size(task) / PAGE_SIZE_64);
4293 assert(((uint32_t)pages) == pages);
4294 *purgeable_pages = (uint32_t)pages;
4295 }
4296}
4297
4298static void
4299memorystatus_get_task_phys_footprint_page_counts(task_t task,
4300 uint64_t *internal_pages, uint64_t *internal_compressed_pages,
4301 uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
4302 uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
4303 uint64_t *iokit_mapped_pages, uint64_t *page_table_pages)
4304{
4305 assert(task);
4306
4307 if (internal_pages) {
4308 *internal_pages = (get_task_internal(task) / PAGE_SIZE_64);
4309 }
4310
4311 if (internal_compressed_pages) {
4312 *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64);
4313 }
4314
4315 if (purgeable_nonvolatile_pages) {
4316 *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64);
4317 }
4318
4319 if (purgeable_nonvolatile_compressed_pages) {
4320 *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64);
4321 }
4322
4323 if (alternate_accounting_pages) {
4324 *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64);
4325 }
4326
4327 if (alternate_accounting_compressed_pages) {
4328 *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64);
4329 }
4330
4331 if (iokit_mapped_pages) {
4332 *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64);
4333 }
4334
4335 if (page_table_pages) {
4336 *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64);
39236c6e 4337 }
39236c6e
A
4338}
4339
39037602
A
4340/*
4341 * This routine only acts on the global jetsam event snapshot.
4342 * Updating the process's entry can race when the memorystatus_thread
4343 * has chosen to kill a process that is racing to exit on another core.
4344 */
39236c6e 4345static void
39037602 4346memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime)
39236c6e 4347{
39037602
A
4348 memorystatus_jetsam_snapshot_entry_t *entry = NULL;
4349 memorystatus_jetsam_snapshot_t *snapshot = NULL;
4350 memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
4351
39236c6e
A
4352 unsigned int i;
4353
39037602
A
4354 if (memorystatus_jetsam_snapshot_count == 0) {
4355 /*
4356 * No active snapshot.
4357 * Nothing to do.
4358 */
4359 return;
4360 }
4361
4362 /*
4363 * Sanity check as this routine should only be called
4364 * from a jetsam kill path.
4365 */
4366 assert(kill_cause != 0 && killtime != 0);
4367
4368 snapshot = memorystatus_jetsam_snapshot;
4369 snapshot_list = memorystatus_jetsam_snapshot->entries;
4370
39236c6e 4371 for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
39037602
A
4372 if (snapshot_list[i].pid == p->p_pid) {
4373
4374 entry = &snapshot_list[i];
4375
4376 if (entry->killed || entry->jse_killtime) {
4377 /*
4378 * We apparently raced on the exit path
4379 * for this process, as it's snapshot entry
4380 * has already recorded a kill.
4381 */
4382 assert(entry->killed && entry->jse_killtime);
4383 break;
4384 }
4385
4386 /*
4387 * Update the entry we just found in the snapshot.
4388 */
4389
4390 entry->killed = kill_cause;
4391 entry->jse_killtime = killtime;
4392 entry->jse_gencount = snapshot->js_gencount;
4393 entry->jse_idle_delta = p->p_memstat_idle_delta;
4394
4395 /*
4396 * If a process has moved between bands since snapshot was
4397 * initialized, then likely these fields changed too.
4398 */
4399 if (entry->priority != p->p_memstat_effectivepriority) {
4400
4401 strlcpy(entry->name, p->p_name, sizeof(entry->name));
4402 entry->priority = p->p_memstat_effectivepriority;
4403 entry->state = memorystatus_build_state(p);
4404 entry->user_data = p->p_memstat_userdata;
4405 entry->fds = p->p_fd->fd_nfiles;
4406 }
4407
4408 /*
4409 * Always update the page counts on a kill.
4410 */
4411
4412 uint32_t pages = 0;
4413 uint32_t max_pages = 0;
4414 uint32_t max_pages_lifetime = 0;
4415 uint32_t purgeable_pages = 0;
4416
4417 memorystatus_get_task_page_counts(p->task, &pages, &max_pages, &max_pages_lifetime, &purgeable_pages);
4418 entry->pages = (uint64_t)pages;
4419 entry->max_pages = (uint64_t)max_pages;
4420 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
4421 entry->purgeable_pages = (uint64_t)purgeable_pages;
4422
4423 uint64_t internal_pages = 0;
4424 uint64_t internal_compressed_pages = 0;
4425 uint64_t purgeable_nonvolatile_pages = 0;
4426 uint64_t purgeable_nonvolatile_compressed_pages = 0;
4427 uint64_t alternate_accounting_pages = 0;
4428 uint64_t alternate_accounting_compressed_pages = 0;
4429 uint64_t iokit_mapped_pages = 0;
4430 uint64_t page_table_pages = 0;
4431
4432 memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
4433 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
4434 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
4435 &iokit_mapped_pages, &page_table_pages);
4436
4437 entry->jse_internal_pages = internal_pages;
4438 entry->jse_internal_compressed_pages = internal_compressed_pages;
4439 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
4440 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
4441 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
4442 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
4443 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
4444 entry->jse_page_table_pages = page_table_pages;
4445
4446 uint64_t region_count = 0;
4447 memorystatus_get_task_memory_region_count(p->task, &region_count);
4448 entry->jse_memory_region_count = region_count;
4449
4450 goto exit;
4451 }
4452 }
4453
4454 if (entry == NULL) {
4455 /*
4456 * The entry was not found in the snapshot, so the process must have
4457 * launched after the snapshot was initialized.
4458 * Let's try to append the new entry.
4459 */
4460 if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) {
4461 /*
4462 * A populated snapshot buffer exists
4463 * and there is room to init a new entry.
4464 */
4465 assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count);
4466
4467 unsigned int next = memorystatus_jetsam_snapshot_count;
4468
4469 if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[next], (snapshot->js_gencount)) == TRUE) {
4470
4471 entry = &snapshot_list[next];
4472 entry->killed = kill_cause;
4473 entry->jse_killtime = killtime;
4474
4475 snapshot->entry_count = ++next;
4476 memorystatus_jetsam_snapshot_count = next;
4477
4478 if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) {
4479 /*
4480 * We just used the last slot in the snapshot buffer.
4481 * We only want to log it once... so we do it here
4482 * when we notice we've hit the max.
4483 */
4484 printf("memorystatus: WARNING snapshot buffer is full, count %d\n",
4485 memorystatus_jetsam_snapshot_count);
4486 }
39236c6e 4487 }
39236c6e
A
4488 }
4489 }
39037602
A
4490
4491exit:
4492 if (entry == NULL) {
4493 /*
4494 * If we reach here, the snapshot buffer could not be updated.
4495 * Most likely, the buffer is full, in which case we would have
4496 * logged a warning in the previous call.
4497 *
4498 * For now, we will stop appending snapshot entries.
4499 * When the buffer is consumed, the snapshot state will reset.
4500 */
4501
4502 MEMORYSTATUS_DEBUG(4, "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n",
4503 p->p_pid, p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count);
4504 }
4505
4506 return;
316670eb 4507}
b0d623f7 4508
5ba3f43e 4509#if CONFIG_JETSAM
39236c6e
A
4510void memorystatus_pages_update(unsigned int pages_avail)
4511{
fe8ab488
A
4512 memorystatus_available_pages = pages_avail;
4513
4514#if VM_PRESSURE_EVENTS
4515 /*
4516 * Since memorystatus_available_pages changes, we should
4517 * re-evaluate the pressure levels on the system and
4518 * check if we need to wake the pressure thread.
4519 * We also update memorystatus_level in that routine.
4520 */
4521 vm_pressure_response();
4522
4523 if (memorystatus_available_pages <= memorystatus_available_pages_pressure) {
4524
4525 if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) {
4526 memorystatus_thread_wake();
4527 }
4528 }
4529#else /* VM_PRESSURE_EVENTS */
4530
39236c6e
A
4531 boolean_t critical, delta;
4532
316670eb
A
4533 if (!memorystatus_delta) {
4534 return;
4535 }
39236c6e
A
4536
4537 critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
4538 delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
4539 || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;
4540
4541 if (critical || delta) {
39037602
A
4542 unsigned int total_pages;
4543
4544 total_pages = (unsigned int) atop_64(max_mem);
4545#if CONFIG_SECLUDED_MEMORY
4546 total_pages -= vm_page_secluded_count;
4547#endif /* CONFIG_SECLUDED_MEMORY */
4548 memorystatus_level = memorystatus_available_pages * 100 / total_pages;
39236c6e 4549 memorystatus_thread_wake();
b0d623f7 4550 }
fe8ab488 4551#endif /* VM_PRESSURE_EVENTS */
316670eb 4552}
5ba3f43e 4553#endif /* CONFIG_JETSAM */
316670eb
A
4554
4555static boolean_t
39037602 4556memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount)
316670eb 4557{
fe8ab488
A
4558 clock_sec_t tv_sec;
4559 clock_usec_t tv_usec;
39037602
A
4560 uint32_t pages = 0;
4561 uint32_t max_pages = 0;
4562 uint32_t max_pages_lifetime = 0;
4563 uint32_t purgeable_pages = 0;
4564 uint64_t internal_pages = 0;
4565 uint64_t internal_compressed_pages = 0;
4566 uint64_t purgeable_nonvolatile_pages = 0;
4567 uint64_t purgeable_nonvolatile_compressed_pages = 0;
4568 uint64_t alternate_accounting_pages = 0;
4569 uint64_t alternate_accounting_compressed_pages = 0;
4570 uint64_t iokit_mapped_pages = 0;
4571 uint64_t page_table_pages = 0;
4572 uint64_t region_count = 0;
4573 uint64_t cids[COALITION_NUM_TYPES];
fe8ab488 4574
39236c6e 4575 memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
39037602 4576
316670eb 4577 entry->pid = p->p_pid;
39037602 4578 strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
39236c6e 4579 entry->priority = p->p_memstat_effectivepriority;
39037602
A
4580
4581 memorystatus_get_task_page_counts(p->task, &pages, &max_pages, &max_pages_lifetime, &purgeable_pages);
4582 entry->pages = (uint64_t)pages;
4583 entry->max_pages = (uint64_t)max_pages;
4584 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
4585 entry->purgeable_pages = (uint64_t)purgeable_pages;
4586
4587 memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
4588 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
4589 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
4590 &iokit_mapped_pages, &page_table_pages);
4591
4592 entry->jse_internal_pages = internal_pages;
4593 entry->jse_internal_compressed_pages = internal_compressed_pages;
4594 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
4595 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
4596 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
4597 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
4598 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
4599 entry->jse_page_table_pages = page_table_pages;
4600
4601 memorystatus_get_task_memory_region_count(p->task, &region_count);
4602 entry->jse_memory_region_count = region_count;
4603
4604 entry->state = memorystatus_build_state(p);
39236c6e 4605 entry->user_data = p->p_memstat_userdata;
316670eb 4606 memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid));
39037602 4607 entry->fds = p->p_fd->fd_nfiles;
fe8ab488
A
4608
4609 absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
4610 entry->cpu_time.tv_sec = tv_sec;
4611 entry->cpu_time.tv_usec = tv_usec;
316670eb 4612
39037602
A
4613 assert(p->p_stats != NULL);
4614 entry->jse_starttime = p->p_stats->ps_start; /* abstime process started */
4615 entry->jse_killtime = 0; /* abstime jetsam chose to kill process */
4616 entry->killed = 0; /* the jetsam kill cause */
4617 entry->jse_gencount = gencount; /* indicates a pass through jetsam thread, when process was targeted to be killed */
4618
4619 entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */
4620
4621 proc_coalitionids(p, cids);
4622 entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
4623
316670eb 4624 return TRUE;
b0d623f7
A
4625}
4626
4627static void
3e170ce0 4628memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
b0d623f7 4629{
39236c6e 4630 kern_return_t kr = KERN_SUCCESS;
39236c6e
A
4631 mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
4632 vm_statistics64_data_t vm_stat;
4633
4634 if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
3e170ce0
A
4635 printf("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr);
4636 memset(&snapshot->stats, 0, sizeof(snapshot->stats));
39236c6e 4637 } else {
3e170ce0
A
4638 snapshot->stats.free_pages = vm_stat.free_count;
4639 snapshot->stats.active_pages = vm_stat.active_count;
4640 snapshot->stats.inactive_pages = vm_stat.inactive_count;
4641 snapshot->stats.throttled_pages = vm_stat.throttled_count;
4642 snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
4643 snapshot->stats.wired_pages = vm_stat.wire_count;
4644
4645 snapshot->stats.speculative_pages = vm_stat.speculative_count;
4646 snapshot->stats.filebacked_pages = vm_stat.external_page_count;
4647 snapshot->stats.anonymous_pages = vm_stat.internal_page_count;
4648 snapshot->stats.compressions = vm_stat.compressions;
4649 snapshot->stats.decompressions = vm_stat.decompressions;
4650 snapshot->stats.compressor_pages = vm_stat.compressor_page_count;
4651 snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
4652 }
5ba3f43e
A
4653
4654 get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity);
4655 get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name),
4656 &snapshot->stats.largest_zone_size);
3e170ce0
A
4657}
4658
4659/*
4660 * Collect vm statistics at boot.
4661 * Called only once (see kern_exec.c)
4662 * Data can be consumed at any time.
4663 */
4664void
4665memorystatus_init_at_boot_snapshot(void) {
4666 memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
4667 memorystatus_at_boot_snapshot.entry_count = 0;
4668 memorystatus_at_boot_snapshot.notification_time = 0; /* updated when consumed */
4669 memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
4670}
4671
4672static void
4673memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count )
4674{
4675 proc_t p, next_p;
4676 unsigned int b = 0, i = 0;
4677
4678 memorystatus_jetsam_snapshot_t *snapshot = NULL;
4679 memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
4680 unsigned int snapshot_max = 0;
4681
4682 if (od_snapshot) {
4683 /*
4684 * This is an on_demand snapshot
4685 */
4686 snapshot = od_snapshot;
4687 snapshot_list = od_snapshot->entries;
4688 snapshot_max = ods_list_count;
4689 } else {
4690 /*
4691 * This is a jetsam event snapshot
4692 */
4693 snapshot = memorystatus_jetsam_snapshot;
4694 snapshot_list = memorystatus_jetsam_snapshot->entries;
4695 snapshot_max = memorystatus_jetsam_snapshot_max;
4696 }
4697
4698 /*
4699 * Init the snapshot header information
4700 */
3e170ce0 4701 memorystatus_init_snapshot_vmstats(snapshot);
4702 snapshot->snapshot_time = mach_absolute_time();
4703 snapshot->notification_time = 0;
4704 snapshot->js_gencount = 0;
3e170ce0 4705
4706 next_p = memorystatus_get_first_proc_locked(&b, TRUE);
4707 while (next_p) {
4708 p = next_p;
4709 next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);
4710
39037602 4711 if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) {
4712 continue;
4713 }
4714
3e170ce0 4715 MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
4716 p->p_pid,
4717 p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7],
4718 p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]);
316670eb 4719
3e170ce0 4720 if (++i == snapshot_max) {
4721 break;
4722 }
4723 }
39236c6e 4724
4725 snapshot->entry_count = i;
4726
4727 if (!od_snapshot) {
4728 /* update the system buffer count */
4729 memorystatus_jetsam_snapshot_count = i;
4730 }
4731}
4732
39236c6e 4733#if DEVELOPMENT || DEBUG
b0d623f7 4734
5ba3f43e 4735#if CONFIG_JETSAM
4736static int
4737memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size) {
4738 int ret;
4739 memorystatus_jetsam_panic_options_t debug;
4740
4741 if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) {
4742 return EINVAL;
b0d623f7 4743 }
4744
4745 ret = copyin(buffer, &debug, buffer_size);
4746 if (ret) {
4747 return ret;
4748 }
4749
4750 /* Panic bits match kMemorystatusKilled* enum */
4751 memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask);
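	/*
	 * Worked example (illustrative values): with a current debug word of
	 * 0b0101, debug.mask = 0b0011 and debug.data = 0b0010, the line above
	 * computes (0b0101 & ~0b0011) | (0b0010 & 0b0011) = 0b0100 | 0b0010
	 * = 0b0110; only the bits selected by the mask are rewritten.
	 */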
4752
4753 /* Copyout new value */
4754 debug.data = memorystatus_jetsam_panic_debug;
4755 ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t));
4756
4757 return ret;
b0d623f7 4758}
5ba3f43e 4759#endif /* CONFIG_JETSAM */
b0d623f7 4760
4761/*
4762 * Triggers a sort_order on a specified jetsam priority band.
4763 * This is for testing only, used to force a path through the sort
4764 * function.
4765 */
4766static int
4767memorystatus_cmd_test_jetsam_sort(int priority, int sort_order) {
4768
4769 int error = 0;
4770
4771 unsigned int bucket_index = 0;
4772
4773 if (priority == -1) {
4774 /* Use as shorthand for default priority */
4775 bucket_index = JETSAM_PRIORITY_DEFAULT;
4776 } else {
4777 bucket_index = (unsigned int)priority;
4778 }
4779
4780 error = memorystatus_sort_bucket(bucket_index, sort_order);
4781
4782 return (error);
4783}
4784
39037602 4785#endif /* DEVELOPMENT || DEBUG */
4786
4787/*
4788 * Jetsam the first process in the queue.
4789 */
4790static boolean_t
4791memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason,
4792 int32_t *priority, uint32_t *errors)
4793{
4794 pid_t aPid;
4795 proc_t p = PROC_NULL, next_p = PROC_NULL;
5ba3f43e 4796 boolean_t new_snapshot = FALSE, force_new_snapshot = FALSE, killed = FALSE;
3e170ce0 4797 int kill_count = 0;
39236c6e 4798 unsigned int i = 0;
3e170ce0 4799 uint32_t aPid_ep;
4800 uint64_t killtime = 0;
4801 clock_sec_t tv_sec;
4802 clock_usec_t tv_usec;
4803 uint32_t tv_msec;
5ba3f43e 4804 int32_t local_max_kill_prio = JETSAM_PRIORITY_IDLE;
b0d623f7 4805
4806#ifndef CONFIG_FREEZE
4807#pragma unused(any)
4808#endif
316670eb 4809
4810 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
4811 memorystatus_available_pages, 0, 0, 0, 0);
6d2010ae 4812
316670eb 4813
4814#if CONFIG_JETSAM
4815 if (sort_flag == TRUE) {
4816 (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
4817 }
4818
4819 local_max_kill_prio = max_kill_priority;
4820
4821 force_new_snapshot = FALSE;
4822
4823#else /* CONFIG_JETSAM */
4824
3e170ce0 4825 if (sort_flag == TRUE) {
4826 (void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT);
4827 }
4828
4829 /*
4830 * On macOS, we currently have only two reasons to be here:
4831 *
4832 * kMemorystatusKilledZoneMapExhaustion
4833 * AND
4834 * kMemorystatusKilledVMThrashing
4835 *
4836 * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider
4837 * any and all processes as eligible kill candidates since we need to avoid a panic.
4838 *
4839 * Since this function can be called asynchronously, it is harder to toggle the max_kill_priority
4840 * value before and after a call. So we use this local variable to set the upper bound
4841 * on the eligible kill bands.
4842 */
4843 if (cause == kMemorystatusKilledZoneMapExhaustion) {
4844 local_max_kill_prio = JETSAM_PRIORITY_MAX;
4845 } else {
4846 local_max_kill_prio = max_kill_priority;
4847 }
4848
4849 /*
4850 * And, because we are here under extreme circumstances, we force a snapshot even for
4851 * IDLE kills.
4852 */
4853 force_new_snapshot = TRUE;
4854
4855#endif /* CONFIG_JETSAM */
4856
3e170ce0 4857 proc_list_lock();
fe8ab488 4858
39236c6e 4859 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5ba3f43e 4860 while (next_p && (next_p->p_memstat_effectivepriority <= local_max_kill_prio)) {
4861#if DEVELOPMENT || DEBUG
4862 int activeProcess;
4863 int procSuspendedForDiagnosis;
4864#endif /* DEVELOPMENT || DEBUG */
4865
4866 p = next_p;
4867 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
4868
6d2010ae 4869#if DEVELOPMENT || DEBUG
4870 activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
4871 procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
6d2010ae 4872#endif /* DEVELOPMENT || DEBUG */
316670eb 4873
39236c6e 4874 aPid = p->p_pid;
3e170ce0 4875 aPid_ep = p->p_memstat_effectivepriority;
316670eb 4876
39236c6e 4877 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
39037602 4878 continue; /* with lock held */
b0d623f7 4879 }
39236c6e 4880
5ba3f43e 4881#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
4882 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
4883 printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
4884 continue;
4885 }
5ba3f43e 4886#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
316670eb 4887
4888 if (cause == kMemorystatusKilledVnodes)
4889 {
4890 /*
4891 * If the system runs out of vnodes, we systematically jetsam
4892 * processes in hopes of stumbling onto a vnode gain that helps
4893 * the system recover. The process that happens to trigger
4894 * this path has no known relationship to the vnode shortage.
4895 * Deadlock avoidance: attempt to safeguard the caller.
4896 */
4897
4898 if (p == current_proc()) {
4899 /* do not jetsam the current process */
4900 continue;
4901 }
4902 }
4903
6d2010ae 4904#if CONFIG_FREEZE
4905 boolean_t skip;
4906 boolean_t reclaim_proc = !(p->p_memstat_state & (P_MEMSTAT_LOCKED | P_MEMSTAT_NORECLAIM));
4907 if (any || reclaim_proc) {
4908 skip = FALSE;
4909 } else {
4910 skip = TRUE;
4911 }
316670eb 4912
4913 if (skip) {
4914 continue;
4915 } else
6d2010ae 4916#endif
39236c6e 4917 {
4918 /*
4919 * Capture a snapshot if none exists and:
4920 * - we are forcing a new snapshot creation, either because:
4921 * - on a particular platform we need these snapshots every time, OR
4922 * - a boot-arg/embedded device tree property has been set.
4923 * - priority was not requested (this is something other than an ambient kill)
4924 * - the priority was requested *and* the targeted process is not at idle priority
4925 */
4926 if ((memorystatus_jetsam_snapshot_count == 0) &&
5ba3f43e 4927 (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) {
3e170ce0 4928 memorystatus_init_jetsam_snapshot_locked(NULL,0);
4929 new_snapshot = TRUE;
4930 }
4931
4932 /*
4933 * Mark as terminated so that if exit1() indicates success, but the process (for example)
4934 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
4935 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
4936 * acquisition of the proc lock.
4937 */
4938 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
4939
4940 killtime = mach_absolute_time();
4941 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
4942 tv_msec = tv_usec / 1000;
39236c6e 4943
5ba3f43e 4944#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
4945 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && activeProcess) {
4946 MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n",
4947 aPid, (*p->p_name ? p->p_name: "(unknown)"), memorystatus_level);
4948 memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic, killtime);
4949 p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
4950 if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) {
4951 jetsam_diagnostic_suspended_one_active_proc = 1;
4952 printf("jetsam: returning after suspending first active proc - %d\n", aPid);
4953 }
4954
4955 p = proc_ref_locked(p);
4956 proc_list_unlock();
4957 if (p) {
316670eb 4958 task_suspend(p->task);
4959 if (priority) {
4960 *priority = aPid_ep;
4961 }
316670eb 4962 proc_rele(p);
4963 killed = TRUE;
4964 }
4965
4966 goto exit;
4967 } else
5ba3f43e 4968#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
4969 {
4970 /* Shift queue, update stats */
39037602 4971 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
4972
4973 if (proc_ref_locked(p) == p) {
4974 proc_list_unlock();
5ba3f43e 4975 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: %s pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
39037602 4976 (unsigned long)tv_sec, tv_msec,
4977 ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process" : "killing_top_process"),
4978 aPid, (*p->p_name ? p->p_name : "unknown"),
4979 memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages);
3e170ce0 4980
4981 /*
4982 * memorystatus_do_kill() drops a reference, so take another one so we can
4983 * continue to use this exit reason even after memorystatus_do_kill()
4984 * returns.
4985 */
4986 os_reason_ref(jetsam_reason);
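				/*
				 * Worked through, the reference counting is: the
				 * caller created jetsam_reason with one reference;
				 * the os_reason_ref() above raises it to two;
				 * memorystatus_do_kill() consumes one; and the
				 * os_reason_free() at this function's exit label
				 * drops ours.
				 */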
4987
4988 killed = memorystatus_do_kill(p, cause, jetsam_reason);
4989
4990 /* Success? */
4991 if (killed) {
4992 if (priority) {
4993 *priority = aPid_ep;
4994 }
4995 proc_rele(p);
4996 kill_count++;
4997 goto exit;
4998 }
39236c6e 4999
5000 /*
5001 * Failure - first unwind the state,
5002 * then fall through to restart the search.
5003 */
5004 proc_list_lock();
5005 proc_rele_locked(p);
5006 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5007 p->p_memstat_state |= P_MEMSTAT_ERROR;
5008 *errors += 1;
6d2010ae 5009 }
39236c6e 5010
5011 /*
5012 * Failure - restart the search.
5013 *
5014 * We might have raced with "p" exiting on another core, resulting in no
5015 * ref on "p". Or, we may have failed to kill "p".
5016 *
5017 * Either way, we fall thru to here, leaving the proc in the
5018 * P_MEMSTAT_TERMINATED state.
5019 *
5020 * And, we hold the proc_list_lock at this point.
5021 */
5022
5023 i = 0;
5024 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6d2010ae 5025 }
b0d623f7 5026 }
b0d623f7 5027 }
316670eb 5028
39236c6e 5029 proc_list_unlock();
316670eb 5030
39236c6e 5031exit:
5032 os_reason_free(jetsam_reason);
5033
5034 /* Clear snapshot if freshly captured and no target was found */
5035 if (new_snapshot && !killed) {
5036 proc_list_lock();
5037 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
5038 proc_list_unlock();
5039 }
5040
39236c6e 5041 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
3e170ce0 5042 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
b0d623f7 5043
39236c6e 5044 return killed;
5045}
5046
5047/*
5048 * Jetsam aggressively
5049 */
39236c6e 5050static boolean_t
5ba3f43e 5051memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count,
39037602 5052 int32_t priority_max, uint32_t *errors)
d1ecb069 5053{
3e170ce0 5054 pid_t aPid;
5055 proc_t p = PROC_NULL, next_p = PROC_NULL;
5056 boolean_t new_snapshot = FALSE, killed = FALSE;
3e170ce0 5057 int kill_count = 0;
39236c6e 5058 unsigned int i = 0;
3e170ce0 5059 int32_t aPid_ep = 0;
490019cf 5060 unsigned int memorystatus_level_snapshot = 0;
5061 uint64_t killtime = 0;
5062 clock_sec_t tv_sec;
5063 clock_usec_t tv_usec;
5064 uint32_t tv_msec;
5ba3f43e 5065 os_reason_t jetsam_reason = OS_REASON_NULL;
3e170ce0
A
5066
5067 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
5068 memorystatus_available_pages, priority_max, 0, 0, 0);
5069
5070 memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
5071
5072 jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause);
5073 if (jetsam_reason == OS_REASON_NULL) {
5074 printf("memorystatus_kill_top_process_aggressive: failed to allocate exit reason\n");
5075 }
5076
39236c6e 5077 proc_list_lock();
3e170ce0 5078
5079 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5080 while (next_p) {
5081#if DEVELOPMENT || DEBUG
5082 int activeProcess;
5083 int procSuspendedForDiagnosis;
5084#endif /* DEVELOPMENT || DEBUG */
39236c6e 5085
5086 if (((next_p->p_listflag & P_LIST_EXITED) != 0) ||
5087 ((unsigned int)(next_p->p_memstat_effectivepriority) != i)) {
5088
5089 /*
5090 * We have raced with next_p running on another core.
5091 * It may be exiting or it may have moved to a different
5092 * jetsam priority band. This means we have lost our
5093 * place in line while traversing the jetsam list. We
5094 * attempt to recover by rewinding to the beginning of the band
5095 * we were already traversing. By doing this, we do not guarantee
5096 * that no process escapes this aggressive march, but we can make
5097 * skipping an entire range of processes less likely. (PR-21069019)
5098 */
5099
5100 MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n",
5101 aggr_count, i, (*next_p->p_name ? next_p->p_name : "unknown"), next_p->p_pid);
5102
5103 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5104 continue;
5105 }
5106
5107 p = next_p;
5108 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
5109
5110 if (p->p_memstat_effectivepriority > priority_max) {
5111 /*
5112 * Bail out of this killing spree if we have
5113 * reached beyond the priority_max jetsam band.
5114 * That is, we kill up to and through the
5115 * priority_max jetsam band.
5116 */
5117 proc_list_unlock();
5118 goto exit;
5119 }
5120
5121#if DEVELOPMENT || DEBUG
5122 activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
5123 procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
5124#endif /* DEVELOPMENT || DEBUG */
5125
5126 aPid = p->p_pid;
5127 aPid_ep = p->p_memstat_effectivepriority;
5128
5129 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
5130 continue;
5131 }
5132
5ba3f43e 5133#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
5134 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
5135 printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
5136 continue;
5137 }
5ba3f43e 5138#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
5139
5140 /*
5141 * Capture a snapshot if none exists.
5142 */
5143 if (memorystatus_jetsam_snapshot_count == 0) {
5144 memorystatus_init_jetsam_snapshot_locked(NULL,0);
5145 new_snapshot = TRUE;
5146 }
5147
5148 /*
5149 * Mark as terminated so that if exit1() indicates success, but the process (for example)
5150 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
5151 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
5152 * acquisition of the proc lock.
5153 */
5154 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
5155
5156 killtime = mach_absolute_time();
5157 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5158 tv_msec = tv_usec / 1000;
5159
5160 /* Shift queue, update stats */
39037602 5161 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
5162
5163 /*
5164 * In order to kill the target process, we will drop the proc_list_lock.
5165 * To guarantee that p and next_p don't disappear out from under the lock,
5166 * we must take a ref on both.
5167 * If we cannot get a reference, then it's likely we've raced with
5168 * that process exiting on another core.
5169 */
5170 if (proc_ref_locked(p) == p) {
5171 if (next_p) {
5172 while (next_p && (proc_ref_locked(next_p) != next_p)) {
5173 proc_t temp_p;
5174
5175 /*
5176 * We must have raced with next_p exiting on another core.
5177 * Recover by getting the next eligible process in the band.
5178 */
5179
5180 MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
39037602 5181 aggr_count, next_p->p_pid, (*next_p->p_name ? next_p->p_name : "(unknown)"));
5182
5183 temp_p = next_p;
5184 next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE);
5185 }
5186 }
5187 proc_list_unlock();
5188
5189 printf("%lu.%03d memorystatus: %s%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
5190 (unsigned long)tv_sec, tv_msec,
5191 ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"),
5192 aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"),
5193 memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages);
3e170ce0 5194
5195 memorystatus_level_snapshot = memorystatus_level;
5196
5197 /*
5198 * memorystatus_do_kill() drops a reference, so take another one so we can
5199 * continue to use this exit reason even after memorystatus_do_kill()
5200 * returns.
5201 */
5202 os_reason_ref(jetsam_reason);
5203 killed = memorystatus_do_kill(p, cause, jetsam_reason);
5204
5205 /* Success? */
5206 if (killed) {
5207 proc_rele(p);
5208 kill_count++;
5209 p = NULL;
5210 killed = FALSE;
5211
5212 /*
5213 * Continue the killing spree.
5214 */
5215 proc_list_lock();
5216 if (next_p) {
5217 proc_rele_locked(next_p);
5218 }
5219
5220 if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) {
5221 if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) {
5222#if DEVELOPMENT || DEBUG
5223 printf("Disabling Lenient mode after one-time deployment.\n");
5224#endif /* DEVELOPMENT || DEBUG */
5225 memorystatus_aggressive_jetsam_lenient = FALSE;
5226 break;
5227 }
5228 }
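			/*
			 * Worked example (illustrative numbers, assuming the
			 * threshold constant is 25): if memorystatus_level was
			 * 10 when snapshotted above and this kill raised it to
			 * 40, the 30-point recovery meets the threshold and
			 * lenient mode is switched off for subsequent passes.
			 */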
5229
5230 continue;
5231 }
5232
5233 /*
5234 * Failure - first unwind the state,
5235 * then fall through to restart the search.
5236 */
5237 proc_list_lock();
5238 proc_rele_locked(p);
5239 if (next_p) {
5240 proc_rele_locked(next_p);
5241 }
5242 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5243 p->p_memstat_state |= P_MEMSTAT_ERROR;
5244 *errors += 1;
5245 p = NULL;
5246 }
5247
5248 /*
5249 * Failure - restart the search at the beginning of
5250 * the band we were already traversing.
5251 *
5252 * We might have raced with "p" exiting on another core, resulting in no
5253 * ref on "p". Or, we may have failed to kill "p".
5254 *
5255 * Either way, we fall thru to here, leaving the proc in the
5256 * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
5257 *
5258 * And, we hold the proc_list_lock at this point.
5259 */
5260
5261 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5262 }
5263
5264 proc_list_unlock();
5265
5266exit:
5267 os_reason_free(jetsam_reason);
5268
5269 /* Clear snapshot if freshly captured and no target was found */
5270 if (new_snapshot && (kill_count == 0)) {
5271 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
5272 }
5273
5274 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
5275 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
5276
5277 if (kill_count > 0) {
5278 return(TRUE);
5279 }
5280 else {
5281 return(FALSE);
5282 }
5283}
5284
5285static boolean_t
5286memorystatus_kill_hiwat_proc(uint32_t *errors)
5287{
5288 pid_t aPid = 0;
5289 proc_t p = PROC_NULL, next_p = PROC_NULL;
5290 boolean_t new_snapshot = FALSE, killed = FALSE;
5291 int kill_count = 0;
5292 unsigned int i = 0;
5293 uint32_t aPid_ep;
5294 uint64_t killtime = 0;
5295 clock_sec_t tv_sec;
5296 clock_usec_t tv_usec;
5297 uint32_t tv_msec;
5298 os_reason_t jetsam_reason = OS_REASON_NULL;
5299 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
5300 memorystatus_available_pages, 0, 0, 0, 0);
5301
5302 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
5303 if (jetsam_reason == OS_REASON_NULL) {
5304 printf("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n");
5305 }
5306
5307 proc_list_lock();
5308
5309 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5310 while (next_p) {
5311 uint64_t footprint_in_bytes = 0;
5312 uint64_t memlimit_in_bytes = 0;
5313 boolean_t skip = 0;
5314
5315 p = next_p;
5316 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
5317
39236c6e 5318 aPid = p->p_pid;
3e170ce0 5319 aPid_ep = p->p_memstat_effectivepriority;
316670eb 5320
5321 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
5322 continue;
5323 }
5324
5325 /* skip if no limit set */
5326 if (p->p_memstat_memlimit <= 0) {
5327 continue;
d1ecb069 5328 }
3e170ce0 5329
5330 footprint_in_bytes = get_task_phys_footprint(p->task);
5331 memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */
5332 skip = (footprint_in_bytes <= memlimit_in_bytes);
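		/*
		 * Worked example (illustrative numbers): a 200 MB
		 * p_memstat_memlimit becomes 200 * 1024 * 1024 = 209,715,200
		 * bytes, so a task with a 250 MB physical footprint exceeds it,
		 * is not skipped, and stays a highwater kill candidate.
		 */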
3e170ce0 5333
5ba3f43e 5334#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
5335 if (!skip && (memorystatus_jetsam_policy & kPolicyDiagnoseActive)) {
5336 if (p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED) {
5337 continue;
6d2010ae 5338 }
39236c6e 5339 }
5ba3f43e 5340#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
316670eb 5341
6d2010ae 5342#if CONFIG_FREEZE
5343 if (!skip) {
5344 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
5345 skip = TRUE;
5346 } else {
5347 skip = FALSE;
5348 }
5349 }
6d2010ae 5350#endif
316670eb 5351
5352 if (skip) {
5353 continue;
5354 } else {
5ba3f43e 5355#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
5356 MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %lld Mb > 1 (%d Mb)\n",
5357 (memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing",
5358 aPid, (*p->p_name ? p->p_name : "unknown"),
5359 (footprint_in_bytes / (1024ULL * 1024ULL)), /* converted bytes to MB */
5360 p->p_memstat_memlimit);
5ba3f43e 5361#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
5362
5363 if (memorystatus_jetsam_snapshot_count == 0) {
3e170ce0 5364 memorystatus_init_jetsam_snapshot_locked(NULL,0);
5365 new_snapshot = TRUE;
5366 }
5367
5368 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
5369
5370 killtime = mach_absolute_time();
5371 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5372 tv_msec = tv_usec / 1000;
39236c6e 5373
5ba3f43e 5374#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
5375 if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
5376 MEMORYSTATUS_DEBUG(1, "jetsam: pid %d suspended for diagnosis - memorystatus_available_pages: %d\n", aPid, memorystatus_available_pages);
39037602 5377 memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic, killtime);
5378 p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
5379
5380 p = proc_ref_locked(p);
5381 proc_list_unlock();
5382 if (p) {
5383 task_suspend(p->task);
5384 proc_rele(p);
5385 killed = TRUE;
5386 }
5387
5388 goto exit;
5389 } else
5ba3f43e 5390#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
39236c6e 5391 {
39037602 5392 memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledHiwat, killtime);
39236c6e 5393
5394 if (proc_ref_locked(p) == p) {
5395 proc_list_unlock();
5396
5397 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_highwater_process pid %d [%s] (highwater %d) - memorystatus_available_pages: %llu\n",
5398 (unsigned long)tv_sec, tv_msec, aPid, (*p->p_name ? p->p_name : "unknown"), aPid_ep, (uint64_t)memorystatus_available_pages);
5399
5400 /*
5401 * memorystatus_do_kill drops a reference, so take another one so we can
5402 * continue to use this exit reason even after memorystatus_do_kill()
5403 * returns
5404 */
5405 os_reason_ref(jetsam_reason);
5406
5407 killed = memorystatus_do_kill(p, kMemorystatusKilledHiwat, jetsam_reason);
3e170ce0 5408
5409 /* Success? */
5410 if (killed) {
5411 proc_rele(p);
5412 kill_count++;
5413 goto exit;
5414 }
5415
5416 /*
5417 * Failure - first unwind the state,
5418 * then fall through to restart the search.
5419 */
5420 proc_list_lock();
5421 proc_rele_locked(p);
5422 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5423 p->p_memstat_state |= P_MEMSTAT_ERROR;
5424 *errors += 1;
6d2010ae 5425 }
6d2010ae 5426
5427 /*
5428 * Failure - restart the search.
5429 *
5430 * We might have raced with "p" exiting on another core, resulting in no
5431 * ref on "p". Or, we may have failed to kill "p".
5432 *
5433 * Either way, we fall thru to here, leaving the proc in the
5434 * P_MEMSTAT_TERMINATED state.
5435 *
5436 * And, we hold the proc_list_lock at this point.
5437 */
5438
5439 i = 0;
5440 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5441 }
5442 }
5443 }
316670eb 5444
39236c6e 5445 proc_list_unlock();
316670eb 5446
39236c6e 5447exit:
5448 os_reason_free(jetsam_reason);
5449
5450 /* Clear snapshot if freshly captured and no target was found */
5451 if (new_snapshot && !killed) {
5452 proc_list_lock();
5453 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
5454 proc_list_unlock();
5455 }
5456
5457 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
5458 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
5459
5460 return killed;
5461}
5462
5463/*
5464 * Jetsam a process pinned in the elevated band.
5465 *
5466 * Return: true -- at least one pinned process was jetsammed
5467 * false -- no pinned process was jetsammed
5468 */
5469static boolean_t
5470memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, int aggr_count, uint32_t *errors)
5471{
5472 pid_t aPid = 0;
5473 proc_t p = PROC_NULL, next_p = PROC_NULL;
5474 boolean_t new_snapshot = FALSE, killed = FALSE;
5475 int kill_count = 0;
5476 unsigned int i = JETSAM_PRIORITY_ELEVATED_INACTIVE;
5477 uint32_t aPid_ep;
5478 uint64_t killtime = 0;
5479 clock_sec_t tv_sec;
5480 clock_usec_t tv_usec;
5481 uint32_t tv_msec;
5482
5483
5484 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
5485 memorystatus_available_pages, 0, 0, 0, 0);
5486
5487 proc_list_lock();
5488
5489 next_p = memorystatus_get_first_proc_locked(&i, FALSE);
5490 while (next_p) {
5491
5492 p = next_p;
5493 next_p = memorystatus_get_next_proc_locked(&i, p, FALSE);
5494
5495 aPid = p->p_pid;
5496 aPid_ep = p->p_memstat_effectivepriority;
5497
5498 /*
5499 * Only pick a process pinned in this elevated band
5500 */
5501 if (!(p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
5502 continue;
5503 }
5504
5505 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
5506 continue;
5507 }
5508
5509#if CONFIG_FREEZE
5510 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
5511 continue;
5512 }
5513#endif
5514
5515#if DEVELOPMENT || DEBUG
5516 MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
5517 aggr_count,
5518 aPid, (*p->p_name ? p->p_name : "unknown"),
5519 memorystatus_available_pages);
5520#endif /* DEVELOPMENT || DEBUG */
5521
5522 if (memorystatus_jetsam_snapshot_count == 0) {
5523 memorystatus_init_jetsam_snapshot_locked(NULL,0);
5524 new_snapshot = TRUE;
5525 }
5526
5527 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
5528
5529 killtime = mach_absolute_time();
5530 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5531 tv_msec = tv_usec / 1000;
5532
5533 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
5534
5535 if (proc_ref_locked(p) == p) {
5536
5537 proc_list_unlock();
5538
5ba3f43e 5539 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
5540 (unsigned long)tv_sec, tv_msec,
5541 aggr_count,
5542 aPid, (*p->p_name ? p->p_name : "unknown"),
5543 memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages);
5544
5545 /*
5546 * memorystatus_do_kill drops a reference, so take another one so we can
5547 * continue to use this exit reason even after memorystatus_do_kill()
5548 * returns
5549 */
5550 os_reason_ref(jetsam_reason);
5551 killed = memorystatus_do_kill(p, cause, jetsam_reason);
5552
5553 /* Success? */
5554 if (killed) {
5555 proc_rele(p);
5556 kill_count++;
5557 goto exit;
5558 }
5559
5560 /*
5561 * Failure - first unwind the state,
5562 * then fall through to restart the search.
5563 */
5564 proc_list_lock();
5565 proc_rele_locked(p);
5566 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5567 p->p_memstat_state |= P_MEMSTAT_ERROR;
5568 *errors += 1;
5569 }
5570
5571 /*
5572 * Failure - restart the search.
5573 *
5574 * We might have raced with "p" exiting on another core, resulting in no
5575 * ref on "p". Or, we may have failed to kill "p".
5576 *
5577 * Either way, we fall thru to here, leaving the proc in the
5578 * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state.
5579 *
5580 * And, we hold the proc_list_lock at this point.
5581 */
5582
5583 next_p = memorystatus_get_first_proc_locked(&i, FALSE);
5584 }
5585
5586 proc_list_unlock();
5587
5588exit:
5589 os_reason_free(jetsam_reason);
5590
39236c6e 5591 /* Clear snapshot if freshly captured and no target was found */
5592 if (new_snapshot && (kill_count == 0)) {
5593 proc_list_lock();
39236c6e 5594 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
39037602 5595 proc_list_unlock();
316670eb 5596 }
5597
5598 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
3e170ce0 5599 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
6d2010ae 5600
39037602 5601 return (killed);
316670eb 5602}
2d21ac55 5603
5604static boolean_t
5605memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) {
5606 /*
5607 * TODO: allow a general async path
5608 *
5609 * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to
5610 * add the appropriate exit reason code mapping.
5611 */
fe8ab488 5612 if ((victim_pid != -1) || (cause != kMemorystatusKilledVMPageShortage && cause != kMemorystatusKilledVMThrashing &&
5ba3f43e 5613 cause != kMemorystatusKilledFCThrashing && cause != kMemorystatusKilledZoneMapExhaustion)) {
39236c6e 5614 return FALSE;
316670eb 5615 }
39236c6e 5616
fe8ab488 5617 kill_under_pressure_cause = cause;
5618 memorystatus_thread_wake();
5619 return TRUE;
5620}
2d21ac55 5621
5622boolean_t
5623memorystatus_kill_on_VM_thrashing(boolean_t async) {
39236c6e 5624 if (async) {
5ba3f43e 5625 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMThrashing);
39236c6e 5626 } else {
5ba3f43e 5627 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMTHRASHING);
39037602 5628 if (jetsam_reason == OS_REASON_NULL) {
5ba3f43e 5629 printf("memorystatus_kill_on_VM_thrashing -- sync: failed to allocate jetsam reason\n");
5630 }
5631
5ba3f43e 5632 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing, jetsam_reason);
5633 }
5634}
2d21ac55 5635
5636#if CONFIG_JETSAM
5637boolean_t
5638memorystatus_kill_on_VM_page_shortage(boolean_t async) {
39236c6e 5639 if (async) {
5ba3f43e 5640 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
39236c6e 5641 } else {
5ba3f43e 5642 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMPAGESHORTAGE);
39037602 5643 if (jetsam_reason == OS_REASON_NULL) {
5ba3f43e 5644 printf("memorystatus_kill_on_VM_page_shortage -- sync: failed to allocate jetsam reason\n");
5645 }
5646
5ba3f43e 5647 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage, jetsam_reason);
5648 }
5649}
b0d623f7 5650
5651boolean_t
5652memorystatus_kill_on_FC_thrashing(boolean_t async) {
5653
5654
5655 if (async) {
5656 return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing);
5657 } else {
5658 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_FCTHRASHING);
5659 if (jetsam_reason == OS_REASON_NULL) {
5660 printf("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n");
5661 }
5662
5663 return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing, jetsam_reason);
5664 }
5665}
5666
5667boolean_t
5668memorystatus_kill_on_vnode_limit(void) {
5669 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE);
5670 if (jetsam_reason == OS_REASON_NULL) {
5671 printf("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n");
5672 }
5673
5674 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason);
5675}
5676
5677#endif /* CONFIG_JETSAM */
5678
5679boolean_t
5680memorystatus_kill_on_zone_map_exhaustion(pid_t pid) {
5681 boolean_t res = FALSE;
5682 if (pid == -1) {
5683 res = memorystatus_kill_process_async(-1, kMemorystatusKilledZoneMapExhaustion);
5684 } else {
5685 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION);
5686 if (jetsam_reason == OS_REASON_NULL) {
5687 printf("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n");
5688 }
5689
5690 res = memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason);
5691 }
5692 return res;
5693}
5694
5695#if CONFIG_FREEZE
5696
5697__private_extern__ void
316670eb 5698memorystatus_freeze_init(void)
6d2010ae 5699{
5700 kern_return_t result;
5701 thread_t thread;
5702
5703 freezer_lck_grp_attr = lck_grp_attr_alloc_init();
5704 freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr);
5705
5706 lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL);
39236c6e 5707
5708 result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread);
5709 if (result == KERN_SUCCESS) {
5710 thread_deallocate(thread);
5711 } else {
5712 panic("Could not create memorystatus_freeze_thread");
5713 }
5714}
5715
5716/*
5717 * Synchronously freeze the passed proc. Called with a reference to the proc held.
5718 *
5719 * Returns EINVAL or the value returned by task_freeze().
5720 */
5721int
5722memorystatus_freeze_process_sync(proc_t p)
5723{
5724 int ret = EINVAL;
5725 pid_t aPid = 0;
5726 boolean_t memorystatus_freeze_swap_low = FALSE;
5727
5728 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
5729 memorystatus_available_pages, 0, 0, 0, 0);
5730
5731 lck_mtx_lock(&freezer_mutex);
5732
5733 if (p == NULL) {
5734 goto exit;
5735 }
5736
5737 if (memorystatus_freeze_enabled == FALSE) {
5738 goto exit;
5739 }
5740
5741 if (!memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
5742 goto exit;
5743 }
5744
5745 if (memorystatus_freeze_update_throttle()) {
5746 printf("memorystatus_freeze_process_sync: in throttle, ignoring freeze\n");
5747 memorystatus_freeze_throttle_count++;
5748 goto exit;
5749 }
5750
5751 proc_list_lock();
5752
5753 if (p != NULL) {
5754 uint32_t purgeable, wired, clean, dirty, state;
5755 uint32_t max_pages, pages, i;
5756 boolean_t shared;
5757
5758 aPid = p->p_pid;
5759 state = p->p_memstat_state;
5760
5761 /* Ensure the process is eligible for freezing */
5762 if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
5763 proc_list_unlock();
5764 goto exit;
5765 }
5766
5767 /* Only freeze processes meeting our minimum resident page criteria */
5768 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
5769 if (pages < memorystatus_freeze_pages_min) {
5770 proc_list_unlock();
5771 goto exit;
5772 }
5773
39037602 5774 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
5775
5776 unsigned int avail_swap_space = 0; /* in pages. */
5777
5778 /*
5779 * Freezer backed by the compressor and swap file(s)
5780 * which will hold compressed data.
5781 */
5782 avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
5783
5784 max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);
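			/*
			 * Worked example (illustrative numbers): with 300 MB of
			 * free swap on a 16 KB-page device, avail_swap_space is
			 * 314,572,800 / 16,384 = 19,200 pages, and max_pages is
			 * that or memorystatus_freeze_pages_max, whichever is
			 * smaller.
			 */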
5785
5786 if (max_pages < memorystatus_freeze_pages_min) {
5787 proc_list_unlock();
5788 goto exit;
5789 }
5790 } else {
5791 /*
5792 * We only have the compressor without any swap.
5793 */
5794 max_pages = UINT32_MAX - 1;
5795 }
5796
5797 /* Mark as locked temporarily to avoid kill */
5798 p->p_memstat_state |= P_MEMSTAT_LOCKED;
5799 proc_list_unlock();
5800
5801 ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
5802
5803 DTRACE_MEMORYSTATUS6(memorystatus_freeze, proc_t, p, unsigned int, memorystatus_available_pages, boolean_t, purgeable, unsigned int, wired, uint32_t, clean, uint32_t, dirty);
5804
3e170ce0 5805 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_process_sync: task_freeze %s for pid %d [%s] - "
5806 "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n",
5807 (ret == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"),
5808 memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared);
5809
5810 proc_list_lock();
5811 p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
5812
5813 if (ret == KERN_SUCCESS) {
5814 memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
5815
5816 memorystatus_frozen_count++;
5817
5818 p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
5819
39037602 5820 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
3e170ce0
A
5821 /* Update stats */
5822 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
5823 throttle_intervals[i].pageouts += dirty;
5824 }
5825 }
5826
5827 memorystatus_freeze_pageouts += dirty;
5828 memorystatus_freeze_count++;
5829
5830 proc_list_unlock();
5831
5832 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
5833 } else {
5834 proc_list_unlock();
5835 }
5836 }
5837
5838exit:
5839 lck_mtx_unlock(&freezer_mutex);
5840 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
5841 memorystatus_available_pages, aPid, 0, 0, 0);
5842
5843 return ret;
5844}
5845
316670eb 5846static int
39236c6e 5847memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low)
6d2010ae 5848{
5849 pid_t aPid = 0;
5850 int ret = -1;
5851 proc_t p = PROC_NULL, next_p = PROC_NULL;
5852 unsigned int i = 0;
6d2010ae 5853
5854 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
5855 memorystatus_available_pages, 0, 0, 0, 0);
5856
5857 proc_list_lock();
6d2010ae 5858
5859 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5860 while (next_p) {
5861 kern_return_t kr;
5862 uint32_t purgeable, wired, clean, dirty;
5863 boolean_t shared;
5864 uint32_t pages;
5865 uint32_t max_pages = 0;
5866 uint32_t state;
5867
5868 p = next_p;
5869 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6d2010ae 5870
5871 aPid = p->p_pid;
5872 state = p->p_memstat_state;
6d2010ae 5873
316670eb 5874 /* Ensure the process is eligible for freezing */
39236c6e 5875 if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
5876 continue; // with lock held
5877 }
316670eb 5878
39236c6e 5879 /* Only freeze processes meeting our minimum resident page criteria */
fe8ab488 5880 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
5881 if (pages < memorystatus_freeze_pages_min) {
5882 continue; // with lock held
5883 }
6d2010ae 5884
39037602 5885 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
5886
5887 /* Ensure there's enough free space to freeze this process. */
5888
5889 unsigned int avail_swap_space = 0; /* in pages. */
5890
5891 /*
5892 * Freezer backed by the compressor and swap file(s)
5893 * which will hold compressed data.
5894 */
5895 avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
5896
5897 max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);
5898
5899 if (max_pages < memorystatus_freeze_pages_min) {
5900 *memorystatus_freeze_swap_low = TRUE;
5901 proc_list_unlock();
5902 goto exit;
316670eb 5903 }
39236c6e 5904 } else {
5905 /*
5906 * We only have the compressor pool.
5907 */
5908 max_pages = UINT32_MAX - 1;
5909 }
5910
5911 /* Mark as locked temporarily to avoid kill */
5912 p->p_memstat_state |= P_MEMSTAT_LOCKED;
5913
5914 p = proc_ref_locked(p);
5915 proc_list_unlock();
5916 if (!p) {
5917 goto exit;
5918 }
5919
5920 kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
5921
5922 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - "
5923 "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n",
5924 (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"),
5925 memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared);
5926
5927 proc_list_lock();
5928 p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
5929
5930 /* Success? */
5931 if (KERN_SUCCESS == kr) {
5932 memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
316670eb 5933
39236c6e 5934 memorystatus_frozen_count++;
316670eb 5935
39236c6e 5936 p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
5937
5938 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
5939 /* Update stats */
5940 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
5941 throttle_intervals[i].pageouts += dirty;
5942 }
39236c6e 5943 }
3e170ce0 5944
39236c6e
A
5945 memorystatus_freeze_pageouts += dirty;
5946 memorystatus_freeze_count++;
5947
5948 proc_list_unlock();
6d2010ae 5949
39236c6e 5950 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
6d2010ae 5951
5952 /* Return KERN_SUCCESS */
5953 ret = kr;
6d2010ae 5954
5955 } else {
5956 proc_list_unlock();
316670eb 5957 }
5958
5959 proc_rele(p);
5960 goto exit;
6d2010ae 5961 }
316670eb 5962
5963 proc_list_unlock();
5964
5965exit:
5966 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
5967 memorystatus_available_pages, aPid, 0, 0, 0);
316670eb 5968
39236c6e 5969 return ret;
5970}
5971
5972static inline boolean_t
5973memorystatus_can_freeze_processes(void)
6d2010ae 5974{
316670eb 5975 boolean_t ret;
6d2010ae 5976
39236c6e 5977 proc_list_lock();
5978
5979 if (memorystatus_suspended_count) {
5980 uint32_t average_resident_pages, estimated_processes;
5981
5982 /* Estimate the number of suspended processes we can fit */
39236c6e 5983 average_resident_pages = memorystatus_suspended_footprint_total / memorystatus_suspended_count;
5984 estimated_processes = memorystatus_suspended_count +
5985 ((memorystatus_available_pages - memorystatus_available_pages_critical) / average_resident_pages);
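		/*
		 * Worked example (illustrative numbers): 10 suspended processes
		 * with a combined footprint of 50,000 pages average 5,000 pages
		 * each; with 20,000 pages of headroom above the critical level,
		 * the estimate is 10 + 20,000 / 5,000 = 14 processes.
		 */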
5986
5987 /* If it's predicted that no freeze will occur, lower the threshold temporarily */
5988 if (estimated_processes <= FREEZE_SUSPENDED_THRESHOLD_DEFAULT) {
5989 memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_LOW;
6d2010ae 5990 } else {
39236c6e 5991 memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
6d2010ae 5992 }
6d2010ae 5993
5994 MEMORYSTATUS_DEBUG(1, "memorystatus_can_freeze_processes: %d suspended processes, %d average resident pages / process, %d suspended processes estimated\n",
5995 memorystatus_suspended_count, average_resident_pages, estimated_processes);
6d2010ae 5996
5997 if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) {
5998 ret = TRUE;
5999 } else {
6000 ret = FALSE;
6d2010ae 6001 }
6002 } else {
6003 ret = FALSE;
6d2010ae 6004 }
316670eb 6005
39236c6e 6006 proc_list_unlock();
6d2010ae 6007
316670eb 6008 return ret;
6009}
6010
6011static boolean_t
6012memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
6d2010ae 6013{
6014 boolean_t can_freeze = TRUE;
6015
6016 /* Only freeze if we're sufficiently low on memory; this holds off freeze right
6017 after boot, and is generally a no-op once we've reached steady state. */
6018 if (memorystatus_available_pages > memorystatus_freeze_threshold) {
6019 return FALSE;
6020 }
6021
6022 /* Check minimum suspended process threshold. */
6023 if (!memorystatus_can_freeze_processes()) {
6024 return FALSE;
6025 }
39037602 6026 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
6d2010ae 6027
39037602 6028 if ( !VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
6029 /*
6030 * In-core compressor used for freezing WITHOUT on-disk swap support.
6031 */
6032 if (vm_compressor_low_on_space()) {
6033 if (memorystatus_freeze_swap_low) {
6034 *memorystatus_freeze_swap_low = TRUE;
6035 }
6036
6037 can_freeze = FALSE;
6038
6039 } else {
6040 if (memorystatus_freeze_swap_low) {
6041 *memorystatus_freeze_swap_low = FALSE;
6042 }
6043
6044 can_freeze = TRUE;
6045 }
6046 } else {
6047 /*
6048 * Freezing WITH on-disk swap support.
6049 *
6050 * In-core compressor fronts the swap.
3e170ce0 6051 */
6052 if (vm_swap_low_on_space()) {
6053 if (memorystatus_freeze_swap_low) {
6054 *memorystatus_freeze_swap_low = TRUE;
6055 }
6056
39037602 6057 can_freeze = FALSE;
316670eb 6058 }
39037602 6059
6060 }
6061
3e170ce0 6062 return can_freeze;
6063}
6064
6065static void
316670eb 6066memorystatus_freeze_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
6d2010ae 6067{
3e170ce0 6068 unsigned int freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE);
6069 if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) {
6070 if (!interval->max_pageouts) {
3e170ce0 6071 interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / (24 * 60)));
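			/*
			 * Worked example (illustrative numbers): with a 1024 MB
			 * daily budget and 4 KB pages, freeze_daily_pageouts_max
			 * is 1024 * 256 = 262,144 pages/day, so a 60-minute
			 * interval with burst_multiple 2 gets
			 * 2 * ((60 * 262,144) / 1440) = 21,844 pages.
			 */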
6d2010ae 6072 } else {
316670eb 6073 printf("memorystatus_freeze_update_throttle_interval: %d minute throttle timeout, resetting\n", interval->mins);
6074 }
6075 interval->ts.tv_sec = interval->mins * 60;
6076 interval->ts.tv_nsec = 0;
6077 ADD_MACH_TIMESPEC(&interval->ts, ts);
316670eb 6078 /* Since we update the throttle stats pre-freeze, adjust for overshoot here */
6079 if (interval->pageouts > interval->max_pageouts) {
6080 interval->pageouts -= interval->max_pageouts;
6081 } else {
6082 interval->pageouts = 0;
6083 }
6084 interval->throttle = FALSE;
6085 } else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) {
316670eb 6086 printf("memorystatus_freeze_update_throttle_interval: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
6087 interval->throttle = TRUE;
6088 }
6089
6090 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
6091 interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60,
6092 interval->throttle ? "on" : "off");
6093}
6094
6095static boolean_t
316670eb 6096memorystatus_freeze_update_throttle(void)
6097{
6098 clock_sec_t sec;
6099 clock_nsec_t nsec;
6100 mach_timespec_t ts;
6101 uint32_t i;
6102 boolean_t throttled = FALSE;
6103
6104#if DEVELOPMENT || DEBUG
316670eb 6105 if (!memorystatus_freeze_throttle_enabled)
6106 return FALSE;
6107#endif
6108
6109 clock_get_system_nanotime(&sec, &nsec);
6110 ts.tv_sec = sec;
6111 ts.tv_nsec = nsec;
6112
316670eb 6113 /* Check freeze pageouts over multiple intervals and throttle if we've exceeded our budget.
6d2010ae 6114 *
316670eb 6115 * This ensures that periods of inactivity can't be used as 'credit' towards freeze if the device has
6116 * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in
6117 * order to allow for bursts of activity.
6118 */
6119 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
316670eb 6120 memorystatus_freeze_update_throttle_interval(&ts, &throttle_intervals[i]);
6121 if (throttle_intervals[i].throttle == TRUE)
6122 throttled = TRUE;
6123 }
6124
6125 return throttled;
6126}
6127
6128static void
316670eb 6129memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
6d2010ae 6130{
316670eb 6131 static boolean_t memorystatus_freeze_swap_low = FALSE;
6132
6133 lck_mtx_lock(&freezer_mutex);
6134 if (memorystatus_freeze_enabled) {
6135 if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
6136 /* Only freeze if we've not exceeded our pageout budgets.*/
6137 if (!memorystatus_freeze_update_throttle()) {
39236c6e 6138 memorystatus_freeze_top_process(&memorystatus_freeze_swap_low);
6139 } else {
6140 printf("memorystatus_freeze_thread: in throttle, ignoring freeze\n");
6141 memorystatus_freeze_throttle_count++; /* Throttled, update stats */
6142 }
6143 }
6144 }
3e170ce0 6145 lck_mtx_unlock(&freezer_mutex);
6d2010ae 6146
6147 assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
6148 thread_block((thread_continue_t) memorystatus_freeze_thread);
6149}
6150
6151static int
6152sysctl_memorystatus_do_fastwake_warmup_all SYSCTL_HANDLER_ARGS
6153{
6154#pragma unused(oidp, req, arg1, arg2)
6155
6156 /* Need to be root or have entitlement */
6157 if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
6158 return EPERM;
6159 }
6160
6161 if (memorystatus_freeze_enabled == FALSE) {
6162 return ENOTSUP;
6163 }
6164
6165 do_fastwake_warmup_all();
6166
6167 return 0;
6168}
6169
6170SYSCTL_PROC(_kern, OID_AUTO, memorystatus_do_fastwake_warmup_all, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
6171 0, 0, &sysctl_memorystatus_do_fastwake_warmup_all, "I", "");
6172
316670eb 6173#endif /* CONFIG_FREEZE */
6d2010ae 6174
fe8ab488 6175#if VM_PRESSURE_EVENTS
6d2010ae 6176
fe8ab488 6177#if CONFIG_MEMORYSTATUS
316670eb 6178
6179static int
6180memorystatus_send_note(int event_code, void *data, size_t data_length) {
6181 int ret;
6182 struct kev_msg ev_msg;
39037602 6183
6184 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6185 ev_msg.kev_class = KEV_SYSTEM_CLASS;
6186 ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;
6187
6188 ev_msg.event_code = event_code;
6189
6190 ev_msg.dv[0].data_length = data_length;
6191 ev_msg.dv[0].data_ptr = data;
6192 ev_msg.dv[1].data_length = 0;
6193
6194 ret = kev_post_msg(&ev_msg);
6195 if (ret) {
6196 printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
316670eb 6197 }
39236c6e 6198
fe8ab488 6199 return ret;
6200}
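
/*
 * Illustrative user-space counterpart (a sketch, not part of this file):
 * events posted above arrive on the standard kernel-event socket from
 * <sys/kern_event.h>, filtered down to the memorystatus subclass.
 */
#if 0
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/kern_event.h>
#include <unistd.h>

static void
drain_memorystatus_events(void)
{
	struct kev_request req = {
		.vendor_code  = KEV_VENDOR_APPLE,
		.kev_class    = KEV_SYSTEM_CLASS,
		.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS,
	};
	char buf[1024];
	int s = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);

	/* Subscribe to memorystatus events only. */
	ioctl(s, SIOCSKEVFILT, &req);

	for (;;) {
		struct kern_event_msg *msg = (struct kern_event_msg *)buf;

		if (read(s, buf, sizeof(buf)) <= 0) {
			break;
		}
		/*
		 * msg->event_code is e.g. kMemorystatusPressureNote; the
		 * payload posted above follows in msg->event_data[].
		 */
		(void)msg;
	}
}
#endif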
6201
fe8ab488 6202boolean_t
813fb2f6 6203memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded) {
316670eb 6204
fe8ab488 6205 boolean_t ret = FALSE;
3e170ce0 6206 boolean_t found_knote = FALSE;
fe8ab488 6207 struct knote *kn = NULL;
813fb2f6 6208 int send_knote_count = 0;
316670eb 6209
6210 /*
6211 * See comment in sysctl_memorystatus_vm_pressure_send.
6212 */
39236c6e 6213
fe8ab488 6214 memorystatus_klist_lock();
6215
6216 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
39037602 6217 proc_t knote_proc = knote_get_kq(kn)->kq_p;
6218 pid_t knote_pid = knote_proc->p_pid;
6219
6220 if (knote_pid == pid) {
6221 /*
6222 * By setting the "fflags" here, we are forcing
6223 * a process to deal with the case where it's
6224 * bumping up into its memory limits. If we don't
6225 * do this here, we will end up depending on the
6226 * system pressure snapshot evaluation in
6227 * filt_memorystatus().
6228 */
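			/*
			 * Illustrative user-space counterpart (a sketch, not
			 * verified against the private SPI): a process lands on
			 * memorystatus_klist by registering an
			 * EVFILT_MEMORYSTATUS knote, roughly:
			 *
			 *	struct kevent ke;
			 *	int kq = kqueue();
			 *	EV_SET(&ke, getpid(), EVFILT_MEMORYSTATUS,
			 *	    EV_ADD | EV_ENABLE,
			 *	    NOTE_MEMORYSTATUS_PROC_LIMIT_WARN, 0, NULL);
			 *	kevent(kq, &ke, 1, NULL, 0, NULL);
			 *
			 * A later kevent() wait then returns with kn_fflags set
			 * by the code below.
			 */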
39037602 6229
6230#if CONFIG_EMBEDDED
6231 if (!limit_exceeded) {
6232 /*
6233 * Intentionally set either the unambiguous limit warning,
6234 * the system-wide critical or the system-wide warning
6235 * notification bit.
6236 */
6237
6238 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
6239 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
6240 found_knote = TRUE;
6241 send_knote_count++;
6242 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
6243 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
6244 found_knote = TRUE;
6245 send_knote_count++;
6246 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
6247 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
6248 found_knote = TRUE;
6249 send_knote_count++;
6250 }
6251 } else {
6252 /*
6253 * Send this notification when a process has exceeded a soft limit.
6254 */
6255 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
6256 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
6257 found_knote = TRUE;
6258 send_knote_count++;
6259 }
6260 }
6261#else /* CONFIG_EMBEDDED */
6262 if (!limit_exceeded) {
6263
6264 /*
6265 * Processes on desktop are not expecting to handle a system-wide
6266 * critical or system-wide warning notification from this path.
6267 * Intentionally set only the unambiguous limit warning here.
6268 *
6269 * If the limit is soft, however, limit this to one notification per
6270 * active/inactive limit (for each registered listener).
6271 */
6272
6273 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
6274 found_knote = TRUE;
6275 if (!is_fatal) {
6276 /*
6277 * Restrict proc_limit_warn notifications when
6278 * non-fatal (soft) limit is at play.
6279 */
6280 if (is_active) {
6281 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
6282 /*
6283 * Mark this knote for delivery.
6284 */
6285 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
6286 /*
6287 * And suppress it from future notifications.
6288 */
6289 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
6290 send_knote_count++;
6291 }
6292 } else {
6293 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
6294 /*
6295 * Mark this knote for delivery.
6296 */
6297 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
6298 /*
6299 * And suppress it from future notifications.
6300 */
6301 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
6302 send_knote_count++;
6303 }
6304 }
6305 } else {
6306 /*
6307 * No restriction on proc_limit_warn notifications when
6308 * fatal (hard) limit is at play.
6309 */
6310 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
6311 send_knote_count++;
6312 }
3e170ce0
A
6313 }
6314 } else {
39037602 6315 /*
813fb2f6 6316 * Send this notification when a process has exceeded a soft limit,
39037602 6317 */
813fb2f6 6318
39037602 6319 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
39037602 6320 found_knote = TRUE;
813fb2f6
A
6321 if (!is_fatal) {
6322 /*
6323 * Restrict critical notifications for soft limits.
6324 */
6325
6326 if (is_active) {
6327 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
6328 /*
6329 * Suppress future proc_limit_critical notifications
6330 * for the active soft limit.
6331 */
6332 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
6333 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
6334 send_knote_count++;
6335
6336 }
6337 } else {
6338 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
6339 /*
6340 * Suppress future proc_limit_critical_notifications
6341 * for the inactive soft limit.
6342 */
6343 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
6344 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
6345 send_knote_count++;
6346 }
6347 }
6348 } else {
6349 /*
6350 * We should never be trying to send a critical notification for
6351 * a hard limit... the process would be killed before it could be
6352 * received.
6353 */
6354 panic("Caught sending pid %d a critical warning for a fatal limit.\n", pid);
6355 }
3e170ce0
A
6356 }
6357 }
5ba3f43e 6358#endif /* CONFIG_EMBEDDED */
39236c6e 6359 }
3e170ce0
A
6360 }
6361
6362 if (found_knote) {
813fb2f6
A
6363 if (send_knote_count > 0) {
6364 KNOTE(&memorystatus_klist, 0);
6365 }
3e170ce0 6366 ret = TRUE;
6d2010ae 6367 }
3e170ce0 6368
fe8ab488 6369 memorystatus_klist_unlock();
6d2010ae 6370
fe8ab488 6371 return ret;
316670eb
A
6372}
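
/*
 * Hedged, illustrative sketch (not from this file's code): how a user-space
 * process might register for the per-process limit warning that
 * memorystatus_warn_process() above marks for delivery. It assumes the
 * private EVFILT_MEMORYSTATUS kqueue filter and the NOTE_MEMORYSTATUS_*
 * flags from <sys/event.h>; the ident of 0 and the overall flow are
 * assumptions, not documented API usage.
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent ke;
 *	EV_SET(&ke, 0, EVFILT_MEMORYSTATUS, EV_ADD | EV_ENABLE,
 *	    NOTE_MEMORYSTATUS_PROC_LIMIT_WARN, 0, NULL);
 *	kevent(kq, &ke, 1, NULL, 0, NULL);	// register the knote
 *	kevent(kq, NULL, 0, &ke, 1, NULL);	// block; ke.fflags carries the note
 */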

/*
 * Can only be set by the current task on itself.
 */
int
memorystatus_low_mem_privileged_listener(uint32_t op_flags)
{
	boolean_t set_privilege = FALSE;
	/*
	 * Need an entitlement check here?
	 */
	if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
		set_privilege = TRUE;
	} else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
		set_privilege = FALSE;
	} else {
		return EINVAL;
	}

	return (task_low_mem_privileged_listener(current_task(), set_privilege, NULL));
}
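
/*
 * Hedged usage sketch (illustration only): a task enabling privileged-listener
 * status on itself through the memorystatus_control() syscall, whose command
 * dispatch routes to the routine above. The command constant and the syscall
 * prototype come from the private <sys/kern_memorystatus.h>; the error
 * handling shown is an assumption.
 *
 *	#include <sys/kern_memorystatus.h>
 *
 *	if (memorystatus_control(MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE,
 *	    0, 0, NULL, 0) != 0) {
 *		// enable failed; the task keeps normal listener status
 *	}
 */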

int
memorystatus_send_pressure_note(pid_t pid) {
	MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
	return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
}

void
memorystatus_send_low_swap_note(void) {

	struct knote *kn = NULL;

	memorystatus_klist_lock();
	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		/*
		 * We call is_knote_registered_modify_task_pressure_bits to check whether the
		 * sfflags for the current knote contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find
		 * one knote in the memorystatus_klist that has NOTE_MEMORYSTATUS_LOW_SWAP set in
		 * its sfflags, we call KNOTE with kMemorystatusLowSwap as the hint to process and
		 * update all knotes on the memorystatus_klist accordingly.
		 */
		if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
			KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
			break;
		}
	}

	memorystatus_klist_unlock();
}

boolean_t
memorystatus_bg_pressure_eligible(proc_t p) {
	boolean_t eligible = FALSE;

	proc_list_lock();

	MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);

	/* Foreground processes have already been dealt with at this point, so just test for eligibility */
	if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
		eligible = TRUE;
	}

	proc_list_unlock();

	return eligible;
}

boolean_t
memorystatus_is_foreground_locked(proc_t p) {
	return ((p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
	        (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT));
}

/*
 * This is meant for stackshot and kperf -- it does not take the proc_list_lock
 * to access the p_memstat_dirty field.
 */
boolean_t
memorystatus_proc_is_dirty_unsafe(void *v)
{
	if (!v) {
		return FALSE;
	}
	proc_t p = (proc_t)v;
	return (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
}

#endif /* CONFIG_MEMORYSTATUS */

/*
 * Trigger levels to test the mechanism.
 * Can be used via a sysctl.
 */
#define TEST_LOW_MEMORY_TRIGGER_ONE		1
#define TEST_LOW_MEMORY_TRIGGER_ALL		2
#define TEST_PURGEABLE_TRIGGER_ONE		3
#define TEST_PURGEABLE_TRIGGER_ALL		4
#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE	5
#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL	6

boolean_t memorystatus_manual_testing_on = FALSE;
vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;

extern struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t);

/*
 * This value is the threshold that a process must meet to be considered for scavenging.
 */
#if CONFIG_EMBEDDED
#define VM_PRESSURE_MINIMUM_RSIZE		1	/* MB */
#else /* CONFIG_EMBEDDED */
#define VM_PRESSURE_MINIMUM_RSIZE		10	/* MB */
#endif /* CONFIG_EMBEDDED */

#define VM_PRESSURE_NOTIFY_WAIT_PERIOD		10000	/* milliseconds */

#if DEBUG
#define VM_PRESSURE_DEBUG(cond, format, ...)			\
do {								\
	if (cond) { printf(format, ##__VA_ARGS__); }		\
} while(0)
#else
#define VM_PRESSURE_DEBUG(cond, format, ...)
#endif

#define INTER_NOTIFICATION_DELAY	(250000)	/* .25 second */

void memorystatus_on_pageout_scan_end(void) {
	/* No-op */
}

/*
 * kn_max - knote
 *
 * knote_pressure_level - to check if the knote is registered for this notification level.
 *
 * task - task whose bits we'll be modifying
 *
 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
 *
 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
 *
 */

boolean_t
is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
{
	if (kn_max->kn_sfflags & knote_pressure_level) {

		if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {

			task_clear_has_been_notified(task, pressure_level_to_clear);
		}

		task_mark_has_been_notified(task, pressure_level_to_set);
		return TRUE;
	}

	return FALSE;
}

void
memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
{
	struct knote *kn = NULL;

	memorystatus_klist_lock();
	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {

		proc_t p = PROC_NULL;
		struct task* t = TASK_NULL;

		p = knote_get_kq(kn)->kq_p;
		proc_list_lock();
		if (p != proc_ref_locked(p)) {
			p = PROC_NULL;
			proc_list_unlock();
			continue;
		}
		proc_list_unlock();

		t = (struct task *)(p->task);

		task_clear_has_been_notified(t, pressure_level_to_clear);

		proc_rele(p);
	}

	memorystatus_klist_unlock();
}

extern kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process);

struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process);

/*
 * Used by the vm_pressure_thread which is
 * signalled from within vm_pageout_scan().
 */
static void vm_dispatch_memory_pressure(void);
void consider_vm_pressure_events(void);

void consider_vm_pressure_events(void)
{
	vm_dispatch_memory_pressure();
}

static void vm_dispatch_memory_pressure(void)
{
	memorystatus_update_vm_pressure(FALSE);
}

extern vm_pressure_level_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);

struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process)
{
	struct knote *kn = NULL, *kn_max = NULL;
	uint64_t resident_max = 0;	/* MB */
	struct timeval curr_tstamp = {0, 0};
	int elapsed_msecs = 0;
	int selected_task_importance = 0;
	static int pressure_snapshot = -1;
	boolean_t pressure_increase = FALSE;

	if (pressure_snapshot == -1) {
		/*
		 * Initial snapshot.
		 */
		pressure_snapshot = level;
		pressure_increase = TRUE;
	} else {

		if (level && (level >= pressure_snapshot)) {
			pressure_increase = TRUE;
		} else {
			pressure_increase = FALSE;
		}

		pressure_snapshot = level;
	}

	if (pressure_increase == TRUE) {
		/*
		 * We'll start by considering the largest
		 * unimportant task in our list.
		 */
		selected_task_importance = INT_MAX;
	} else {
		/*
		 * We'll start by considering the largest
		 * important task in our list.
		 */
		selected_task_importance = 0;
	}

	microuptime(&curr_tstamp);

	SLIST_FOREACH(kn, candidate_list, kn_selnext) {

		uint64_t resident_size = 0;	/* MB */
		proc_t p = PROC_NULL;
		struct task* t = TASK_NULL;
		int curr_task_importance = 0;
		boolean_t consider_knote = FALSE;
		boolean_t privileged_listener = FALSE;

		p = knote_get_kq(kn)->kq_p;
		proc_list_lock();
		if (p != proc_ref_locked(p)) {
			p = PROC_NULL;
			proc_list_unlock();
			continue;
		}
		proc_list_unlock();

#if CONFIG_MEMORYSTATUS
		if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
			/*
			 * Skip process not marked foreground.
			 */
			proc_rele(p);
			continue;
		}
#endif /* CONFIG_MEMORYSTATUS */

		t = (struct task *)(p->task);

		timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp);
		elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;

		vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);

		if ((kn->kn_sfflags & dispatch_level) == 0) {
			proc_rele(p);
			continue;
		}

#if CONFIG_MEMORYSTATUS
		if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
			VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid);
			proc_rele(p);
			continue;
		}
#endif /* CONFIG_MEMORYSTATUS */

#if CONFIG_EMBEDDED
		curr_task_importance = p->p_memstat_effectivepriority;
#else /* CONFIG_EMBEDDED */
		curr_task_importance = task_importance_estimate(t);
#endif /* CONFIG_EMBEDDED */

		/*
		 * Privileged listeners are only considered in the multi-level pressure scheme
		 * AND only if the pressure is increasing.
		 */
		if (level > 0) {

			if (task_has_been_notified(t, level) == FALSE) {

				/*
				 * Is this a privileged listener?
				 */
				if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {

					if (privileged_listener) {
						kn_max = kn;
						proc_rele(p);
						goto done_scanning;
					}
				}
			} else {
				proc_rele(p);
				continue;
			}
		} else if (level == 0) {

			/*
			 * Task wasn't notified when the pressure was increasing and so
			 * no need to notify it that the pressure is decreasing.
			 */
			if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
				proc_rele(p);
				continue;
			}
		}

		/*
		 * We don't want a small process to block large processes from
		 * being notified again. <rdar://problem/7955532>
		 */
		resident_size = (get_task_phys_footprint(t))/(1024*1024ULL);	/* MB */

		if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) {

			if (level > 0) {
				/*
				 * Warning or Critical Pressure.
				 */
				if (pressure_increase) {
					if ((curr_task_importance < selected_task_importance) ||
					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {

						/*
						 * We have found a candidate process which is:
						 * a) at a lower importance than the current selected process
						 * OR
						 * b) has importance equal to that of the current selected process but is larger
						 */

						consider_knote = TRUE;
					}
				} else {
					if ((curr_task_importance > selected_task_importance) ||
					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {

						/*
						 * We have found a candidate process which is:
						 * a) at a higher importance than the current selected process
						 * OR
						 * b) has importance equal to that of the current selected process but is larger
						 */

						consider_knote = TRUE;
					}
				}
			} else if (level == 0) {
				/*
				 * Pressure back to normal.
				 */
				if ((curr_task_importance > selected_task_importance) ||
				    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {

					consider_knote = TRUE;
				}
			}

			if (consider_knote) {
				resident_max = resident_size;
				kn_max = kn;
				selected_task_importance = curr_task_importance;
				consider_knote = FALSE;	/* reset for the next candidate */
			}
		} else {
			/* There was no candidate with enough resident memory to scavenge */
			VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", p->p_pid, resident_size);
		}
		proc_rele(p);
	}

done_scanning:
	if (kn_max) {
		VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, knote_get_kq(kn_max)->kq_p->p_pid, resident_max, 0, 0);
		VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", knote_get_kq(kn_max)->kq_p->p_pid, resident_max);
	}

	return kn_max;
}

#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD		5000	/* milliseconds */
#define WARNING_NOTIFICATION_RESTING_PERIOD		25	/* seconds */
#define CRITICAL_NOTIFICATION_RESTING_PERIOD		25	/* seconds */

uint64_t next_warning_notification_sent_at_ts = 0;
uint64_t next_critical_notification_sent_at_ts = 0;

kern_return_t
memorystatus_update_vm_pressure(boolean_t target_foreground_process)
{
	struct knote *kn_max = NULL;
	struct knote *kn_cur = NULL, *kn_temp = NULL;	/* for safe list traversal */
	pid_t target_pid = -1;
	struct klist dispatch_klist = { NULL };
	proc_t target_proc = PROC_NULL;
	struct task *task = NULL;
	boolean_t found_candidate = FALSE;

	static vm_pressure_level_t level_snapshot = kVMPressureNormal;
	static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
	boolean_t smoothing_window_started = FALSE;
	struct timeval smoothing_window_start_tstamp = {0, 0};
	struct timeval curr_tstamp = {0, 0};
	int elapsed_msecs = 0;
	uint64_t curr_ts = mach_absolute_time();

#if !CONFIG_JETSAM
#define MAX_IDLE_KILLS 100	/* limit the number of idle kills allowed */

	int idle_kill_counter = 0;

	/*
	 * On desktop we take this opportunity to free up memory pressure
	 * by immediately killing idle exitable processes. We use a delay
	 * to avoid overkill. And we impose a max counter as a fail safe
	 * in case daemons re-launch too fast.
	 */
	while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
		if (memorystatus_idle_exit_from_VM() == FALSE) {
			/* No idle exitable processes left to kill */
			break;
		}
		idle_kill_counter++;

		if (memorystatus_manual_testing_on == TRUE) {
			/*
			 * Skip the delay when testing
			 * the pressure notification scheme.
			 */
		} else {
			delay(1000000);	/* 1 second */
		}
	}
#endif /* !CONFIG_JETSAM */

	if (level_snapshot != kVMPressureNormal) {

		/*
		 * Check to see if we are still in the 'resting' period
		 * after having notified all clients interested in
		 * a particular pressure level.
		 */

		level_snapshot = memorystatus_vm_pressure_level;

		if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {

			if (next_warning_notification_sent_at_ts) {
				if (curr_ts < next_warning_notification_sent_at_ts) {
					delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
					return KERN_SUCCESS;
				}

				next_warning_notification_sent_at_ts = 0;
				memorystatus_klist_reset_all_for_level(kVMPressureWarning);
			}
		} else if (level_snapshot == kVMPressureCritical) {

			if (next_critical_notification_sent_at_ts) {
				if (curr_ts < next_critical_notification_sent_at_ts) {
					delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
					return KERN_SUCCESS;
				}
				next_critical_notification_sent_at_ts = 0;
				memorystatus_klist_reset_all_for_level(kVMPressureCritical);
			}
		}
	}

	while (1) {

		/*
		 * There is a race window here. But it's not clear
		 * how much we benefit from having extra synchronization.
		 */
		level_snapshot = memorystatus_vm_pressure_level;

		if (prev_level_snapshot > level_snapshot) {
			/*
			 * Pressure decreased? Let's take a little breather
			 * and see if this condition stays.
			 */
			if (smoothing_window_started == FALSE) {

				smoothing_window_started = TRUE;
				microuptime(&smoothing_window_start_tstamp);
			}

			microuptime(&curr_tstamp);
			timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
			elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;

			if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {

				delay(INTER_NOTIFICATION_DELAY);
				continue;
			}
		}

		prev_level_snapshot = level_snapshot;
		smoothing_window_started = FALSE;

		memorystatus_klist_lock();
		kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);

		if (kn_max == NULL) {
			memorystatus_klist_unlock();

			/*
			 * No more level-based clients to notify.
			 *
			 * Start the 'resting' window within which clients will not be re-notified.
			 */

			if (level_snapshot != kVMPressureNormal) {
				if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
					nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);

					/* Next warning notification (if nothing changes) won't be sent before... */
					next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
				}

				if (level_snapshot == kVMPressureCritical) {
					nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);

					/* Next critical notification (if nothing changes) won't be sent before... */
					next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
				}
			}
			return KERN_FAILURE;
		}

		target_proc = knote_get_kq(kn_max)->kq_p;

		proc_list_lock();
		if (target_proc != proc_ref_locked(target_proc)) {
			target_proc = PROC_NULL;
			proc_list_unlock();
			memorystatus_klist_unlock();
			continue;
		}
		proc_list_unlock();

		target_pid = target_proc->p_pid;

		task = (struct task *)(target_proc->task);

		if (level_snapshot != kVMPressureNormal) {

			if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {

				if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
					found_candidate = TRUE;
				}
			} else {
				if (level_snapshot == kVMPressureCritical) {

					if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
						found_candidate = TRUE;
					}
				}
			}
		} else {
			if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {

				task_clear_has_been_notified(task, kVMPressureWarning);
				task_clear_has_been_notified(task, kVMPressureCritical);

				found_candidate = TRUE;
			}
		}

		if (found_candidate == FALSE) {
			proc_rele(target_proc);
			memorystatus_klist_unlock();
			continue;
		}

		SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {

			int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);

			if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
				proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
				pid_t knote_pid = knote_proc->p_pid;
				if (knote_pid == target_pid) {
					KNOTE_DETACH(&memorystatus_klist, kn_cur);
					KNOTE_ATTACH(&dispatch_klist, kn_cur);
				}
			}
		}

		KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);

		SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
			KNOTE_DETACH(&dispatch_klist, kn_cur);
			KNOTE_ATTACH(&memorystatus_klist, kn_cur);
		}

		memorystatus_klist_unlock();

		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
		proc_rele(target_proc);

		if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
			break;
		}

		if (memorystatus_manual_testing_on == TRUE) {
			/*
			 * Testing out the pressure notification scheme.
			 * No need for delays etc.
			 */
		} else {

			uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
#if CONFIG_JETSAM
			unsigned int page_delta = 0;
			unsigned int skip_delay_page_threshold = 0;

			assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);

			page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
			skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;

			if (memorystatus_available_pages <= skip_delay_page_threshold) {
				/*
				 * We are nearing the critical mark fast and can't afford to wait between
				 * notifications.
				 */
				sleep_interval = 0;
			}
#endif /* CONFIG_JETSAM */

			if (sleep_interval) {
				delay(sleep_interval);
			}
		}
	}

	return KERN_SUCCESS;
}

vm_pressure_level_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
{
	vm_pressure_level_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;

	switch (internal_pressure_level) {

	case kVMPressureNormal:
	{
		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
		break;
	}

	case kVMPressureWarning:
	case kVMPressureUrgent:
	{
		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
		break;
	}

	case kVMPressureCritical:
	{
		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
		break;
	}

	default:
		break;
	}

	return dispatch_level;
}

static int
sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
#if CONFIG_EMBEDDED
	int error = 0;

	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
	if (error)
		return (error);

#endif /* CONFIG_EMBEDDED */
	vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);

	return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
}

#if DEBUG || DEVELOPMENT

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#else /* DEBUG || DEVELOPMENT */

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#endif /* DEBUG || DEVELOPMENT */

extern int memorystatus_purge_on_warning;
extern int memorystatus_purge_on_critical;

static int
sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int level = 0;
	int error = 0;
	int pressure_level = 0;
	int trigger_request = 0;
	int force_purge;

	error = sysctl_handle_int(oidp, &level, 0, req);
	if (error || !req->newptr) {
		return (error);
	}

	memorystatus_manual_testing_on = TRUE;

	trigger_request = (level >> 16) & 0xFFFF;
	pressure_level = (level & 0xFFFF);

	if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
	    trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
		return EINVAL;
	}
	switch (pressure_level) {
	case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
	case NOTE_MEMORYSTATUS_PRESSURE_WARN:
	case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
		break;
	default:
		return EINVAL;
	}

	/*
	 * The pressure level is being set from user-space.
	 * And user-space uses the constants in sys/event.h
	 * So we translate those events to our internal levels here.
	 */
	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {

		memorystatus_manual_testing_level = kVMPressureNormal;
		force_purge = 0;

	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {

		memorystatus_manual_testing_level = kVMPressureWarning;
		force_purge = memorystatus_purge_on_warning;

	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {

		memorystatus_manual_testing_level = kVMPressureCritical;
		force_purge = memorystatus_purge_on_critical;
	}

	memorystatus_vm_pressure_level = memorystatus_manual_testing_level;

	/* purge according to the new pressure level */
	switch (trigger_request) {
	case TEST_PURGEABLE_TRIGGER_ONE:
	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
		if (force_purge == 0) {
			/* no purging requested */
			break;
		}
		vm_purgeable_object_purge_one_unlocked(force_purge);
		break;
	case TEST_PURGEABLE_TRIGGER_ALL:
	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
		if (force_purge == 0) {
			/* no purging requested */
			break;
		}
		while (vm_purgeable_object_purge_one_unlocked(force_purge));
		break;
	}

	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {

		memorystatus_update_vm_pressure(TRUE);
	}

	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {

		while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
			continue;
		}
	}

	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
		memorystatus_manual_testing_on = FALSE;
	}

	return 0;
}

SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
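
/*
 * Hedged example (illustration only): composing the manual-trigger value
 * from user space. The low 16 bits carry a sys/event.h pressure constant;
 * the high 16 bits carry one of the file-local TEST_* trigger requests
 * above. TEST_LOW_MEMORY_TRIGGER_ALL (2) is not exported, so its raw value
 * is used here as an assumption.
 *
 *	#include <sys/sysctl.h>
 *	#include <sys/event.h>
 *
 *	int val = (2 << 16) | NOTE_MEMORYSTATUS_PRESSURE_WARN;
 *	sysctlbyname("kern.memorypressure_manual_trigger", NULL, NULL,
 *	    &val, sizeof(val));
 */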

extern int memorystatus_purge_on_warning;
extern int memorystatus_purge_on_urgent;
extern int memorystatus_purge_on_critical;

SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_warning, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_urgent, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_critical, 0, "");

#endif /* VM_PRESSURE_EVENTS */

/* Return both allocated and actual size, since there's a race between allocation and list compilation */
static int
memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
{
	uint32_t list_count, i = 0;
	memorystatus_priority_entry_t *list_entry;
	proc_t p;

	list_count = memorystatus_list_count;
	*list_size = sizeof(memorystatus_priority_entry_t) * list_count;

	/* Just a size check? */
	if (size_only) {
		return 0;
	}

	/* Otherwise, validate the size of the buffer */
	if (*buffer_size < *list_size) {
		return EINVAL;
	}

	*list_ptr = (memorystatus_priority_entry_t*)kalloc(*list_size);
	if (!*list_ptr) {
		/* Test the allocation itself, not the address of the out-pointer */
		return ENOMEM;
	}

	memset(*list_ptr, 0, *list_size);

	*buffer_size = *list_size;
	*list_size = 0;

	list_entry = *list_ptr;

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (p && (*list_size < *buffer_size)) {
		list_entry->pid = p->p_pid;
		list_entry->priority = p->p_memstat_effectivepriority;
		list_entry->user_data = p->p_memstat_userdata;

		if (p->p_memstat_memlimit <= 0) {
			task_get_phys_footprint_limit(p->task, &list_entry->limit);
		} else {
			list_entry->limit = p->p_memstat_memlimit;
		}

		list_entry->state = memorystatus_build_state(p);
		list_entry++;

		*list_size += sizeof(memorystatus_priority_entry_t);

		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
	}

	proc_list_unlock();

	MEMORYSTATUS_DEBUG(1, "memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);

	return 0;
}

static int
memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size) {
	int error = 0;
	memorystatus_priority_entry_t mp_entry;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_entry_t))) {
		return EINVAL;
	}

	proc_t p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	memset(&mp_entry, 0, sizeof(memorystatus_priority_entry_t));

	mp_entry.pid = p->p_pid;
	mp_entry.priority = p->p_memstat_effectivepriority;
	mp_entry.user_data = p->p_memstat_userdata;
	if (p->p_memstat_memlimit <= 0) {
		task_get_phys_footprint_limit(p->task, &mp_entry.limit);
	} else {
		mp_entry.limit = p->p_memstat_memlimit;
	}
	mp_entry.state = memorystatus_build_state(p);

	proc_rele(p);

	error = copyout(&mp_entry, buffer, buffer_size);

	return (error);
}

static int
memorystatus_cmd_get_priority_list(pid_t pid, user_addr_t buffer, size_t buffer_size, int32_t *retval) {
	int error = 0;
	boolean_t size_only;
	size_t list_size;

	/*
	 * When a non-zero pid is provided, the 'list' has only one entry.
	 */

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	if (pid != 0) {
		list_size = sizeof(memorystatus_priority_entry_t) * 1;
		if (!size_only) {
			error = memorystatus_get_priority_pid(pid, buffer, buffer_size);
		}
	} else {
		memorystatus_priority_entry_t *list = NULL;
		error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);

		if (error == 0) {
			if (!size_only) {
				error = copyout(list, buffer, list_size);
			}
		}

		if (list) {
			kfree(list, buffer_size);
		}
	}

	if (error == 0) {
		*retval = list_size;
	}

	return (error);
}
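
/*
 * Hedged usage sketch (illustration only): the two-call pattern a user-space
 * client might use against MEMORYSTATUS_CMD_GET_PRIORITY_LIST. Passing a
 * NULL buffer asks for the size only (returned via retval); a second call
 * fetches the entries. Error handling is elided; constants and the
 * memorystatus_control() prototype are from the private
 * <sys/kern_memorystatus.h>.
 *
 *	int size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST,
 *	    0, 0, NULL, 0);
 *	memorystatus_priority_entry_t *list = malloc(size);
 *	memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST,
 *	    0, 0, list, size);
 */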

static void
memorystatus_clear_errors(void)
{
	proc_t p;
	unsigned int i = 0;

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START, 0, 0, 0, 0, 0);

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (p) {
		if (p->p_memstat_state & P_MEMSTAT_ERROR) {
			p->p_memstat_state &= ~P_MEMSTAT_ERROR;
		}
		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
	}

	proc_list_unlock();

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}

#if CONFIG_JETSAM
static void
memorystatus_update_levels_locked(boolean_t critical_only) {

	memorystatus_available_pages_critical = memorystatus_available_pages_critical_base;

	/*
	 * If there's an entry in the first bucket, we have idle processes.
	 */

	memstat_bucket_t *first_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
	if (first_bucket->count) {
		memorystatus_available_pages_critical += memorystatus_available_pages_critical_idle_offset;

		if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
			/*
			 * The critical threshold must never exceed the pressure threshold
			 */
			memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
		}
	}

#if DEBUG || DEVELOPMENT
	if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
		memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_diagnostic;

		if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
			/*
			 * The critical threshold must never exceed the pressure threshold
			 */
			memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	if (memorystatus_jetsam_policy & kPolicyMoreFree) {
		memorystatus_available_pages_critical += memorystatus_policy_more_free_offset_pages;
	}

	if (critical_only) {
		return;
	}

#if VM_PRESSURE_EVENTS
	memorystatus_available_pages_pressure = (pressure_threshold_percentage / delta_percentage) * memorystatus_delta;
#if DEBUG || DEVELOPMENT
	if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
		memorystatus_available_pages_pressure += memorystatus_jetsam_policy_offset_pages_diagnostic;
	}
#endif /* DEBUG || DEVELOPMENT */
#endif /* VM_PRESSURE_EVENTS */
}

static int
sysctl_kern_memorystatus_policy_more_free SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int error = 0, more_free = 0;

	/*
	 * TODO: Enable this privilege check?
	 *
	 * error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
	 * if (error)
	 *	return (error);
	 */

	error = sysctl_handle_int(oidp, &more_free, 0, req);
	if (error || !req->newptr)
		return (error);

	if ((more_free && ((memorystatus_jetsam_policy & kPolicyMoreFree) == kPolicyMoreFree)) ||
	    (!more_free && ((memorystatus_jetsam_policy & kPolicyMoreFree) == 0))) {

		/*
		 * No change in state.
		 */
		return 0;
	}

	proc_list_lock();

	if (more_free) {
		memorystatus_jetsam_policy |= kPolicyMoreFree;
	} else {
		memorystatus_jetsam_policy &= ~kPolicyMoreFree;
	}

	memorystatus_update_levels_locked(TRUE);

	proc_list_unlock();

	return 0;
}
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_policy_more_free, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_kern_memorystatus_policy_more_free, "I", "");

#endif /* CONFIG_JETSAM */

/*
 * Get the at_boot snapshot
 */
static int
memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
	size_t input_size = *snapshot_size;

	/*
	 * The at_boot snapshot has no entry list.
	 */
	*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t);

	if (size_only) {
		return 0;
	}

	/*
	 * Validate the size of the snapshot buffer
	 */
	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	/*
	 * Update the notification_time only
	 */
	memorystatus_at_boot_snapshot.notification_time = mach_absolute_time();
	*snapshot = &memorystatus_at_boot_snapshot;

	MEMORYSTATUS_DEBUG(7, "memorystatus_get_at_boot_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%d)\n",
	    (long)input_size, (long)*snapshot_size, 0);
	return 0;
}

static int
memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
	size_t input_size = *snapshot_size;
	uint32_t ods_list_count = memorystatus_list_count;
	memorystatus_jetsam_snapshot_t *ods = NULL;	/* The on_demand snapshot buffer */

	*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (ods_list_count));

	if (size_only) {
		return 0;
	}

	/*
	 * Validate the size of the snapshot buffer.
	 * This is inherently racy. May want to revisit
	 * this error condition and trim the output when
	 * it doesn't fit.
	 */
	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	/*
	 * Allocate and initialize a snapshot buffer.
	 */
	ods = (memorystatus_jetsam_snapshot_t *)kalloc(*snapshot_size);
	if (!ods) {
		return (ENOMEM);
	}

	memset(ods, 0, *snapshot_size);

	proc_list_lock();
	memorystatus_init_jetsam_snapshot_locked(ods, ods_list_count);
	proc_list_unlock();

	/*
	 * Return the kernel allocated, on_demand buffer.
	 * The caller of this routine will copy the data out
	 * to user space and then free the kernel allocated
	 * buffer.
	 */
	*snapshot = ods;

	MEMORYSTATUS_DEBUG(7, "memorystatus_get_on_demand_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
	    (long)input_size, (long)*snapshot_size, (long)ods_list_count);

	return 0;
}

static int
memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
	size_t input_size = *snapshot_size;

	if (memorystatus_jetsam_snapshot_count > 0) {
		*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
	} else {
		*snapshot_size = 0;
	}

	if (size_only) {
		return 0;
	}

	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	*snapshot = memorystatus_jetsam_snapshot;

	MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
	    (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_count);

	return 0;
}


static int
memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval) {
	int error = EINVAL;
	boolean_t size_only;
	boolean_t is_default_snapshot = FALSE;
	boolean_t is_on_demand_snapshot = FALSE;
	boolean_t is_at_boot_snapshot = FALSE;
	memorystatus_jetsam_snapshot_t *snapshot;

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	if (flags == 0) {
		/* Default */
		is_default_snapshot = TRUE;
		error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
	} else {
		if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) {
			/*
			 * Unsupported bit set in flag.
			 */
			return EINVAL;
		}

		if ((flags & (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) ==
		    (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) {
			/*
			 * Can't have both set at the same time.
			 */
			return EINVAL;
		}

		if (flags & MEMORYSTATUS_SNAPSHOT_ON_DEMAND) {
			is_on_demand_snapshot = TRUE;
			/*
			 * When not requesting the size only, the following call will allocate
			 * an on_demand snapshot buffer, which is freed below.
			 */
			error = memorystatus_get_on_demand_snapshot(&snapshot, &buffer_size, size_only);

		} else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
			is_at_boot_snapshot = TRUE;
			error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
		} else {
			/*
			 * Invalid flag setting.
			 */
			return EINVAL;
		}
	}

	if (error) {
		goto out;
	}

	/*
	 * Copy the data out to user space and clear the snapshot buffer.
	 * If working with the jetsam snapshot,
	 * clearing the buffer means resetting the count.
	 * If working with an on_demand snapshot,
	 * clearing the buffer means freeing it.
	 * If working with the at_boot snapshot,
	 * there is nothing to clear or update.
	 */
	if (!size_only) {
		if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
			if (is_default_snapshot) {
				/*
				 * The jetsam snapshot is never freed, its count is simply reset.
				 */
				proc_list_lock();
				snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
				memorystatus_jetsam_snapshot_last_timestamp = 0;
				proc_list_unlock();
			}
		}

		if (is_on_demand_snapshot) {
			/*
			 * The on_demand snapshot is always freed,
			 * even if the copyout failed.
			 */
			if (snapshot) {
				kfree(snapshot, buffer_size);
			}
		}
	}

	if (error == 0) {
		*retval = buffer_size;
	}
out:
	return error;
}
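
/*
 * Hedged usage sketch (illustration only): fetching the default jetsam
 * snapshot from user space with the usual size-probe-then-copy pattern.
 * Constants and the memorystatus_control() prototype are from the private
 * <sys/kern_memorystatus.h>; allocation and error handling are elided.
 *
 *	int size = memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT,
 *	    0, 0, NULL, 0);
 *	memorystatus_jetsam_snapshot_t *snap = malloc(size);
 *	memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT,
 *	    0, 0, snap, size);	// a successful copy also resets the kernel's entry count
 */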

/*
 * Routine:	memorystatus_cmd_grp_set_properties
 * Purpose:	Update properties for a group of processes.
 *
 * Supported Properties:
 * [priority]
 *	Move each process out of its effective priority
 *	band and into a new priority band.
 *	Maintains relative order from lowest to highest priority.
 *	In a single band, maintains relative order from head to tail.
 *
 *	eg: before	[effectivepriority | pid]
 *			[18 | p101 ]
 *			[17 | p55, p67, p19 ]
 *			[12 | p103 p10 ]
 *			[ 7 | p25 ]
 *			[ 0 | p71, p82, ]
 *
 *	after	[ new band | pid]
 *		[ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
 *
 * Returns:	0 on success, else non-zero.
 *
 * Caveat:	We know there is a race window regarding recycled pids.
 *		A process could be killed before the kernel can act on it here.
 *		If a pid cannot be found in any of the jetsam priority bands,
 *		then we simply ignore it. No harm.
 *		But, if the pid has been recycled then it could be an issue.
 *		In that scenario, we might move an unsuspecting process to the new
 *		priority band. It's not clear how the kernel can safeguard
 *		against this, but it would be an extremely rare case anyway.
 *		The caller of this api might avoid such race conditions by
 *		ensuring that the processes passed in the pid list are suspended.
 */
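
/*
 * Hedged example (illustration only): how a user-space coordinator might
 * batch-move two pids into one band via this command. The array layout
 * follows memorystatus_priority_entry_t with unused fields zeroed; the pids
 * are hypothetical, and this is a sketch, not documented API usage.
 *
 *	memorystatus_priority_entry_t entries[2] = {
 *		{ .pid = 101, .priority = JETSAM_PRIORITY_IDLE },
 *		{ .pid = 102, .priority = JETSAM_PRIORITY_IDLE },
 *	};
 *	memorystatus_control(MEMORYSTATUS_CMD_GRP_SET_PROPERTIES,
 *	    0, 0, entries, sizeof(entries));
 */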


/* This internal structure can expand when we add support for more properties */
typedef struct memorystatus_internal_properties
{
	proc_t proc;
	int32_t priority;	/* see memorystatus_priority_entry_t : priority */
} memorystatus_internal_properties_t;


static int
memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {

#pragma unused (flags)

	/*
	 * We only handle setting priority
	 * per process
	 */

	int error = 0;
	memorystatus_priority_entry_t *entries = NULL;
	uint32_t entry_count = 0;

	/* This will be the ordered proc list */
	memorystatus_internal_properties_t *table = NULL;
	size_t table_size = 0;
	uint32_t table_count = 0;

	uint32_t i = 0;
	uint32_t bucket_index = 0;
	boolean_t head_insert;
	int32_t new_priority;

	proc_t p;

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0) || ((buffer_size % sizeof(memorystatus_priority_entry_t)) != 0)) {
		error = EINVAL;
		goto out;
	}

	entry_count = (buffer_size / sizeof(memorystatus_priority_entry_t));
	if ((entries = (memorystatus_priority_entry_t *)kalloc(buffer_size)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, entry_count, 0, 0, 0, 0);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/* Verify sanity of input priorities */
	for (i = 0; i < entry_count; i++) {
		if (entries[i].priority == -1) {
			/* Use as shorthand for default priority */
			entries[i].priority = JETSAM_PRIORITY_DEFAULT;
		} else if ((entries[i].priority == system_procs_aging_band) || (entries[i].priority == applications_aging_band)) {
			/* Both the aging bands are reserved for internal use;
			 * if requested, adjust to JETSAM_PRIORITY_IDLE. */
			entries[i].priority = JETSAM_PRIORITY_IDLE;
		} else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
			 * queue */
			/* Deal with this later */
		} else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
			/* Sanity check */
			error = EINVAL;
			goto out;
		}
	}

	table_size = sizeof(memorystatus_internal_properties_t) * entry_count;
	if ((table = (memorystatus_internal_properties_t *)kalloc(table_size)) == NULL) {
		error = ENOMEM;
		goto out;
	}
	memset(table, 0, table_size);


	/*
	 * For each jetsam bucket entry, spin through the input property list.
	 * When a matching pid is found, populate an adjacent table with the
	 * appropriate proc pointer and new property values.
	 * This traversal automatically preserves order from lowest
	 * to highest priority.
	 */

	bucket_index = 0;

	proc_list_lock();

	/* Create the ordered table */
	p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
	while (p && (table_count < entry_count)) {
		for (i = 0; i < entry_count; i++) {
			if (p->p_pid == entries[i].pid) {
				/* Build the table data */
				table[table_count].proc = p;
				table[table_count].priority = entries[i].priority;
				table_count++;
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
	}

	/* We now have an ordered list of procs ready to move */
	for (i = 0; i < table_count; i++) {
		p = table[i].proc;
		assert(p != NULL);

		/* Allow head inserts -- but relative order is no longer preserved */
		if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			new_priority = JETSAM_PRIORITY_IDLE;
			head_insert = true;
		} else {
			new_priority = table[i].priority;
			head_insert = false;
		}

		/* Not allowed */
		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			continue;
		}

		/*
		 * Take appropriate steps if moving proc out of
		 * either of the aging bands.
		 */
		if ((p->p_memstat_effectivepriority == system_procs_aging_band) || (p->p_memstat_effectivepriority == applications_aging_band)) {
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
		}

		memorystatus_update_priority_locked(p, new_priority, head_insert, false);
	}

	proc_list_unlock();

	/*
	 * if (table_count != entry_count)
	 * then some pids were not found in a jetsam band.
	 * harmless but interesting...
	 */
	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, entry_count, table_count, 0, 0, 0);

out:
	if (entries)
		kfree(entries, buffer_size);
	if (table)
		kfree(table, table_size);

	return (error);
}


/*
 * This routine is used to update a process's jetsam priority position and stored user_data.
 * It is not used for the setting of memory limits, which is why the last 6 args to the
 * memorystatus_update() call are 0 or FALSE.
 */

static int
memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
	int error = 0;
	memorystatus_priority_properties_t mpp_entry;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_properties_t))) {
		return EINVAL;
	}

	error = copyin(buffer, &mpp_entry, buffer_size);

	if (error == 0) {
		proc_t p;

		p = proc_find(pid);
		if (!p) {
			return ESRCH;
		}

		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			proc_rele(p);
			return EPERM;
		}

		error = memorystatus_update(p, mpp_entry.priority, mpp_entry.user_data, FALSE, FALSE, 0, 0, FALSE, FALSE);
		proc_rele(p);
	}

	return (error);
}

static int
memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
	int error = 0;
	memorystatus_memlimit_properties_t mmp_entry;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
		return EINVAL;
	}

	error = copyin(buffer, &mmp_entry, buffer_size);

	if (error == 0) {
		error = memorystatus_set_memlimit_properties(pid, &mmp_entry);
	}

	return (error);
}

/*
 * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
 * That gets the proc's cached memlimit and there is no guarantee that the active/inactive
 * limits will be the same in the no-limit case. Instead we convert limits <= 0 using
 * task_convert_phys_footprint_limit(). It computes the same limit value that would be written
 * to the task's ledgers via task_set_phys_footprint_limit().
 */
static int
memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
	int error = 0;
	memorystatus_memlimit_properties_t mmp_entry;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
		return EINVAL;
	}

	memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties_t));

	proc_t p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	/*
	 * Get the active limit and attributes.
	 * No locks taken since we hold a reference to the proc.
	 */

	if (p->p_memstat_memlimit_active > 0) {
		mmp_entry.memlimit_active = p->p_memstat_memlimit_active;
	} else {
		task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_active);
	}

	if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {
		mmp_entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
	}

	/*
	 * Get the inactive limit and attributes
	 */
	if (p->p_memstat_memlimit_inactive <= 0) {
		task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_inactive);
	} else {
		mmp_entry.memlimit_inactive = p->p_memstat_memlimit_inactive;
	}
	if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {
		mmp_entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
	}
	proc_rele(p);

	error = copyout(&mmp_entry, buffer, buffer_size);

	return (error);
}
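
/*
 * Hedged usage sketch (illustration only): reading a process's
 * active/inactive limits from user space. Field names follow
 * memorystatus_memlimit_properties_t; limits are in MB, matching the
 * kernel side above. Error handling is elided.
 *
 *	memorystatus_memlimit_properties_t mmp;
 *	memorystatus_control(MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES,
 *	    pid, 0, &mmp, sizeof(mmp));
 *	printf("active limit: %d MB%s\n", mmp.memlimit_active,
 *	    (mmp.memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) ?
 *	    " (fatal)" : "");
 */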
8002
3e170ce0 8003
39037602
A
8004/*
8005 * SPI for kbd - pr24956468
8006 * This is a very simple snapshot that calculates how much a
8007 * process's phys_footprint exceeds a specific memory limit.
8008 * Only the inactive memory limit is supported for now.
8009 * The delta is returned as bytes in excess or zero.
8010 */
8011static int
8012memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
8013 int error = 0;
8014 uint64_t footprint_in_bytes = 0;
8015 uint64_t delta_in_bytes = 0;
8016 int32_t memlimit_mb = 0;
8017 uint64_t memlimit_bytes = 0;
8018
8019 /* Validate inputs */
8020 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(uint64_t)) || (flags != 0)) {
8021 return EINVAL;
8022 }
8023
8024 proc_t p = proc_find(pid);
8025 if (!p) {
8026 return ESRCH;
8027 }
8028
8029 /*
8030 * Get the inactive limit.
8031 * No locks taken since we hold a reference to the proc.
8032 */
8033
8034 if (p->p_memstat_memlimit_inactive <= 0) {
8035 task_convert_phys_footprint_limit(-1, &memlimit_mb);
8036 } else {
8037 memlimit_mb = p->p_memstat_memlimit_inactive;
8038 }
8039
8040 footprint_in_bytes = get_task_phys_footprint(p->task);
8041
8042 proc_rele(p);
8043
8044 memlimit_bytes = ((uint64_t)memlimit_mb) * 1024 * 1024; /* MB to bytes; widen before multiplying to avoid 32-bit overflow */
8045
8046 /*
8047 * Computed delta always returns >= 0 bytes
8048 */
8049 if (footprint_in_bytes > memlimit_bytes) {
8050 delta_in_bytes = footprint_in_bytes - memlimit_bytes;
8051 }
8052
8053 error = copyout(&delta_in_bytes, buffer, sizeof(delta_in_bytes));
8054
8055 return(error);
8056}
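/*
 * Illustrative sketch (not part of this file's build): querying the excess
 * over the inactive limit. Per the validation above, flags must be 0 and
 * the buffer exactly sizeof(uint64_t); wrapper name as assumed above.
 *
 *	uint64_t excess_bytes = 0;
 *	if (memorystatus_control(MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS,
 *	    pid, 0, &excess_bytes, sizeof(excess_bytes)) == 0) {
 *		// 0 means the footprint is at or under the inactive limit.
 *	}
 */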
8057
8058
39236c6e
A
8059static int
8060memorystatus_cmd_get_pressure_status(int32_t *retval) {
8061 int error;
8062
8063 /* Need privilege for check */
8064 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
8065 if (error) {
8066 return (error);
8067 }
8068
8069 /* Inherently racy, so it's not worth taking a lock here */
8070 *retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
8071
8072 return error;
8073}
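/*
 * Illustrative sketch (not part of this file's build): a caller holding
 * PRIV_VM_PRESSURE can poll the binary pressure state. The result is
 * produced via *retval, so (assuming the usual syscall wrapper) it
 * arrives as the call's return value: 1 when above normal, else 0.
 *
 *	int pressured = memorystatus_control(MEMORYSTATUS_CMD_GET_PRESSURE_STATUS,
 *	    0, 0, NULL, 0);
 *	// pressured < 0 indicates an error (e.g. EPERM without the privilege)
 */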
316670eb 8074
3e170ce0
A
8075int
8076memorystatus_get_pressure_status_kdp(void) {
8077 return (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
8078}
8079
fe8ab488
A
8080/*
8081 * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
3e170ce0
A
8082 *
8083 * This call is inflexible -- it does not distinguish between active/inactive or fatal/non-fatal limits.
8084 * So, with the 2-level HWM scheme, preserving previous behavior maps as follows:
8085 * - treat the limit passed in as both an active and inactive limit.
8086 * - treat the is_fatal_limit flag as though it applies to both active and inactive limits.
8087 *
8088 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK
8089 * - the is_fatal_limit is FALSE, meaning the active and inactive limits are non-fatal/soft
8090 * - so mapping is (active/non-fatal, inactive/non-fatal)
8091 *
8092 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
8093 * - the is_fatal_limit is TRUE, meaning the process's active and inactive limits are fatal/hard
8094 * - so mapping is (active/fatal, inactive/fatal)
fe8ab488
A
8095 */
8096
5ba3f43e 8097#if CONFIG_JETSAM
b0d623f7 8098static int
fe8ab488 8099memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit) {
39236c6e 8100 int error = 0;
3e170ce0
A
8101 memorystatus_memlimit_properties_t entry;
8102
8103 entry.memlimit_active = high_water_mark;
8104 entry.memlimit_active_attr = 0;
8105 entry.memlimit_inactive = high_water_mark;
8106 entry.memlimit_inactive_attr = 0;
8107
8108 if (is_fatal_limit == TRUE) {
8109 entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8110 entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8111 }
8112
8113 error = memorystatus_set_memlimit_properties(pid, &entry);
8114 return (error);
8115}
5ba3f43e 8116#endif /* CONFIG_JETSAM */
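/*
 * Illustrative sketch (not part of this file's build): the two legacy
 * commands and the 2-level mapping they produce, per the comment above.
 * The limit in MB travels in the flags argument; no buffer is used.
 *
 *	// 100 MB soft limit for both active and inactive states:
 *	memorystatus_control(MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK, pid, 100, NULL, 0);
 *	// 100 MB fatal limit for both states:
 *	memorystatus_control(MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT, pid, 100, NULL, 0);
 */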
3e170ce0
A
8117
8118static int
8119memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry) {
8120
8121 int32_t memlimit_active;
8122 boolean_t memlimit_active_is_fatal;
8123 int32_t memlimit_inactive;
8124 boolean_t memlimit_inactive_is_fatal;
8125 uint32_t valid_attrs = 0;
8126 int error = 0;
39236c6e
A
8127
8128 proc_t p = proc_find(pid);
8129 if (!p) {
8130 return ESRCH;
8131 }
3e170ce0
A
8132
8133 /*
8134 * Check for valid attribute flags.
8135 */
8136 valid_attrs |= (MEMORYSTATUS_MEMLIMIT_ATTR_FATAL);
8137 if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
8138 proc_rele(p);
8139 return EINVAL;
8140 }
8141 if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
8142 proc_rele(p);
8143 return EINVAL;
39236c6e 8144 }
fe8ab488 8145
3e170ce0
A
8146 /*
8147 * Setup the active memlimit properties
8148 */
8149 memlimit_active = entry->memlimit_active;
8150 if (entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
8151 memlimit_active_is_fatal = TRUE;
8152 } else {
8153 memlimit_active_is_fatal = FALSE;
8154 }
fe8ab488 8155
3e170ce0
A
8156 /*
8157 * Setup the inactive memlimit properties
8158 */
8159 memlimit_inactive = entry->memlimit_inactive;
8160 if (entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
8161 memlimit_inactive_is_fatal = TRUE;
8162 } else {
8163 memlimit_inactive_is_fatal = FALSE;
39236c6e
A
8164 }
8165
3e170ce0
A
8166 /*
8167 * Setting a limit of <= 0 implies that the process has no
8168 * high-water-mark and no per-task-limit. In that case the
8169 * system-wide task limit is in place, and that limit is
8170 * always fatal.
8171 */
8172
8173 if (memlimit_active <= 0) {
8174 /*
8175 * Enforce the fatal system_wide task limit while process is active.
8176 */
8177 memlimit_active = -1;
8178 memlimit_active_is_fatal = TRUE;
8179 }
8180
8181 if (memlimit_inactive <= 0) {
8182 /*
8183 * Enforce the fatal system_wide task limit while process is inactive.
8184 */
8185 memlimit_inactive = -1;
8186 memlimit_inactive_is_fatal = TRUE;
8187 }
8188
8189 proc_list_lock();
8190
8191 /*
8192 * Store the active limit variants in the proc.
8193 */
8194 SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);
8195
8196 /*
8197 * Store the inactive limit variants in the proc.
8198 */
8199 SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);
8200
8201 /*
8202 * Enforce appropriate limit variant by updating the cached values
8203 * and writing the ledger.
8204 * Limit choice is based on process active/inactive state.
8205 */
8206
8207 if (memorystatus_highwater_enabled) {
813fb2f6
A
8208 boolean_t is_fatal;
8209 boolean_t use_active;
3e170ce0
A
8210
8211 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
813fb2f6
A
8212 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
8213 use_active = TRUE;
fe8ab488 8214 } else {
813fb2f6
A
8215 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
8216 use_active = FALSE;
fe8ab488 8217 }
3e170ce0
A
8218
8219 /* Enforce the limit by writing to the ledgers */
813fb2f6 8220 error = (task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal) == 0) ? 0 : EINVAL;
3e170ce0
A
8221
8222 MEMORYSTATUS_DEBUG(3, "memorystatus_set_memlimit_properties: new limit on pid %d (%dMB %s) current priority (%d) dirty_state?=0x%x %s\n",
8223 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
8224 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
8225 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
39037602 8226 DTRACE_MEMORYSTATUS2(memorystatus_set_memlimit, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
fe8ab488
A
8227 }
8228
39236c6e
A
8229 proc_list_unlock();
8230 proc_rele(p);
8231
8232 return error;
8233}
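/*
 * Illustrative sketch (not part of this file's build): setting split
 * limits with the newer command. Per the normalization above, a variant
 * <= 0 selects the system-wide task limit for that state and is forced fatal.
 *
 *	memorystatus_memlimit_properties_t mmp = {
 *		.memlimit_active = 100,		// 100 MB while active...
 *		.memlimit_active_attr = 0,	// ...non-fatal (soft)
 *		.memlimit_inactive = 0,		// system-wide limit while inactive,
 *		.memlimit_inactive_attr = 0,	//    implicitly fatal
 *	};
 *	memorystatus_control(MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES,
 *	    pid, 0, &mmp, sizeof(mmp));
 */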
8234
fe8ab488
A
8235/*
8236 * Returns the jetsam priority (effective or requested) of the process
8237 * associated with this task.
8238 */
8239int
8240proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
8241{
8242 if (p) {
8243 if (effective_priority) {
8244 return p->p_memstat_effectivepriority;
8245 } else {
8246 return p->p_memstat_requestedpriority;
8247 }
8248 }
8249 return 0;
8250}
3e170ce0 8251
39236c6e
A
8252int
8253memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret) {
8254 int error = EINVAL;
39037602 8255 os_reason_t jetsam_reason = OS_REASON_NULL;
39236c6e
A
8256
8257#if !CONFIG_JETSAM
8258 #pragma unused(ret)
39037602 8259 #pragma unused(jetsam_reason)
39236c6e
A
8260#endif
8261
39037602
A
8262 /* Need to be root or have entitlement */
8263 if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
39236c6e
A
8264 error = EPERM;
8265 goto out;
b0d623f7 8266 }
39037602
A
8267
8268 /*
8269 * Sanity-check the buffer size.
8270 * Not enforced for snapshots, which can legitimately be larger.
8271 */
8272 if (args->command != MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT) {
8273 if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
8274 error = EINVAL;
8275 goto out;
8276 }
39236c6e
A
8277 }
8278
8279 switch (args->command) {
8280 case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
5ba3f43e 8281 error = memorystatus_cmd_get_priority_list(args->pid, args->buffer, args->buffersize, ret);
39236c6e 8282 break;
39236c6e
A
8283 case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
8284 error = memorystatus_cmd_set_priority_properties(args->pid, args->buffer, args->buffersize, ret);
8285 break;
3e170ce0
A
8286 case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
8287 error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
8288 break;
8289 case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES:
8290 error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
8291 break;
39037602
A
8292 case MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS:
8293 error = memorystatus_cmd_get_memlimit_excess_np(args->pid, args->flags, args->buffer, args->buffersize, ret);
8294 break;
fe8ab488
A
8295 case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
8296 error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
8297 break;
39236c6e 8298 case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
3e170ce0 8299 error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
39236c6e
A
8300 break;
8301 case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
8302 error = memorystatus_cmd_get_pressure_status(ret);
8303 break;
5ba3f43e 8304#if CONFIG_JETSAM
39236c6e 8305 case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
3e170ce0
A
8306 /*
8307 * This call does not distinguish between active and inactive limits.
8308 * Default behavior in 2-level HWM world is to set both.
8309 * Non-fatal limit is also assumed for both.
8310 */
fe8ab488
A
8311 error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
8312 break;
8313 case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
3e170ce0
A
8314 /*
8315 * This call does not distinguish between active and inactive limits.
8316 * Default behavior in 2-level HWM world is to set both.
8317 * Fatal limit is also assumed for both.
8318 */
fe8ab488 8319 error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
39236c6e 8320 break;
5ba3f43e 8321#endif /* CONFIG_JETSAM */
39236c6e
A
8322 /* Test commands */
8323#if DEVELOPMENT || DEBUG
8324 case MEMORYSTATUS_CMD_TEST_JETSAM:
39037602
A
8325 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_GENERIC);
8326 if (jetsam_reason == OS_REASON_NULL) {
8327 printf("memorystatus_control: failed to allocate jetsam reason\n");
8328 }
8329
8330 error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 0 : EINVAL;
39236c6e 8331 break;
3e170ce0
A
8332 case MEMORYSTATUS_CMD_TEST_JETSAM_SORT:
8333 error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags);
8334 break;
5ba3f43e 8335#if CONFIG_JETSAM
39236c6e
A
8336 case MEMORYSTATUS_CMD_SET_JETSAM_PANIC_BITS:
8337 error = memorystatus_cmd_set_panic_bits(args->buffer, args->buffersize);
8338 break;
5ba3f43e 8339#endif /* CONFIG_JETSAM */
39037602
A
8340#else /* DEVELOPMENT || DEBUG */
8341 #pragma unused(jetsam_reason)
39236c6e 8342#endif /* DEVELOPMENT || DEBUG */
490019cf
A
8343 case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE:
8344 if (memorystatus_aggressive_jetsam_lenient_allowed == FALSE) {
8345#if DEVELOPMENT || DEBUG
8346 printf("Enabling Lenient Mode\n");
8347#endif /* DEVELOPMENT || DEBUG */
8348
8349 memorystatus_aggressive_jetsam_lenient_allowed = TRUE;
8350 memorystatus_aggressive_jetsam_lenient = TRUE;
39037602 8351 error = 0;
490019cf
A
8352 }
8353 break;
8354 case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE:
8355#if DEVELOPMENT || DEBUG
8356 printf("Disabling Lenient mode\n");
8357#endif /* DEVELOPMENT || DEBUG */
8358 memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
8359 memorystatus_aggressive_jetsam_lenient = FALSE;
39037602 8360 error = 0;
490019cf 8361 break;
3e170ce0
A
8362 case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
8363 case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
8364 error = memorystatus_low_mem_privileged_listener(args->command);
8365 break;
39037602 8366
39037602
A
8367 case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE:
8368 case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE:
8369 error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, args->flags ? TRUE : FALSE);
8370 break;
39037602 8371
39236c6e
A
8372 default:
8373 break;
8374 }
8375
8376out:
8377 return error;
8378}
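/*
 * Illustrative sketch (not part of this file's build): every command above
 * passes the same gate, so the caller must be root or carry
 * MEMORYSTATUS_ENTITLEMENT. A minimal lenient-mode toggle under that assumption:
 *
 *	if (memorystatus_control(MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE,
 *	    0, 0, NULL, 0) != 0) {
 *		// EPERM: neither root nor entitled
 *	}
 */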
8379
8380
8381static int
5ba3f43e 8382filt_memorystatusattach(struct knote *kn, __unused struct kevent_internal_s *kev)
39236c6e 8383{
39037602
A
8384 int error;
8385
39236c6e 8386 kn->kn_flags |= EV_CLEAR;
39037602
A
8387 error = memorystatus_knote_register(kn);
8388 if (error) {
8389 kn->kn_flags = EV_ERROR;
8390 kn->kn_data = error;
8391 }
8392 return 0;
39236c6e
A
8393}
8394
8395static void
8396filt_memorystatusdetach(struct knote *kn)
8397{
8398 memorystatus_knote_unregister(kn);
8399}
8400
8401static int
8402 filt_memorystatus(struct knote *kn, long hint)
8403{
8404 if (hint) {
8405 switch (hint) {
8406 case kMemorystatusNoPressure:
8407 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
3e170ce0 8408 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
39236c6e
A
8409 }
8410 break;
8411 case kMemorystatusPressure:
8412 if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
8413 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
3e170ce0 8414 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
39236c6e
A
8415 }
8416 } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
8417
8418 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
3e170ce0 8419 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
39236c6e
A
8420 }
8421 }
8422 break;
fe8ab488
A
8423 case kMemorystatusLowSwap:
8424 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
3e170ce0 8425 kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
fe8ab488
A
8426 }
8427 break;
39037602
A
8428
8429 case kMemorystatusProcLimitWarn:
8430 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
8431 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
8432 }
8433 break;
8434
8435 case kMemorystatusProcLimitCritical:
8436 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
8437 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
8438 }
8439 break;
8440
39236c6e
A
8441 default:
8442 break;
b0d623f7 8443 }
39236c6e 8444 }
813fb2f6
A
8445
8446#if 0
8447 if (kn->kn_fflags != 0) {
8448 proc_t knote_proc = knote_get_kq(kn)->kq_p;
8449 pid_t knote_pid = knote_proc->p_pid;
8450
8451 printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
8452 (unsigned long)kn, kn->kn_fflags, knote_pid);
8453 }
8454#endif
8455
39236c6e
A
8456 return (kn->kn_fflags != 0);
8457}
8458
39037602
A
8459static int
8460filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev)
8461{
8462 int res;
813fb2f6 8463 int prev_kn_sfflags = 0;
39037602
A
8464
8465 memorystatus_klist_lock();
8466
8467 /*
8468 * copy in new kevent settings
8469 * (saving the "desired" data and fflags).
8470 */
813fb2f6
A
8471
8472 prev_kn_sfflags = kn->kn_sfflags;
8473 kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
8474
5ba3f43e 8475#if !CONFIG_EMBEDDED
813fb2f6
A
8476 /*
8477 * Only on desktop do we restrict notifications to
8478 * one per active/inactive state (soft limits only).
8479 */
8480 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
8481 /*
8482 * Is there previous state to preserve?
8483 */
8484 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
8485 /*
8486 * This knote was previously interested in proc_limit_warn,
8487 * so yes, preserve previous state.
8488 */
8489 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
8490 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
8491 }
8492 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
8493 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
8494 }
8495 } else {
8496 /*
8497 * This knote was not previously interested in proc_limit_warn,
8498 * but it is now. Set both states.
8499 */
8500 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
8501 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
8502 }
8503 }
8504
8505 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
8506 /*
8507 * Is there previous state to preserve?
8508 */
8509 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
8510 /*
8511 * This knote was previously interested in proc_limit_critical,
8512 * so yes, preserve previous state.
8513 */
8514 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
8515 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
8516 }
8517 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
8518 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
8519 }
8520 } else {
8521 /*
8522 * This knote was not previously interested in proc_limit_critical,
8523 * but it is now. Set both states.
8524 */
8525 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
8526 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
8527 }
8528 }
5ba3f43e 8529#endif /* !CONFIG_EMBEDDED */
39037602
A
8530
8531 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
8532 kn->kn_udata = kev->udata;
8533
8534 /*
8535 * Resetting the output flags to the intersection of the old events
8536 * and the new desired event list is left disabled here, so events
8537 * already delivered in kn_fflags survive a touch.
8538 */
8539 //kn->kn_fflags &= kn->kn_sfflags;
8540
8541 res = (kn->kn_fflags != 0);
8542
8543 memorystatus_klist_unlock();
8544
8545 return res;
8546}
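/*
 * Illustrative sketch (not part of this file's build): registering for
 * these events from userspace with kqueue (the ident is unused here, so 0).
 * On desktop, a bare PROC_LIMIT_WARN interest is expanded to the ACTIVE
 * and INACTIVE sub-states, as above.
 *
 *	int kq = kqueue();
 *	struct kevent ke;
 *	EV_SET(&ke, 0, EVFILT_MEMORYSTATUS, EV_ADD,
 *	    NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PROC_LIMIT_WARN,
 *	    0, NULL);
 *	kevent(kq, &ke, 1, NULL, 0, NULL);	// then read events back via kevent()
 */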
8547
8548static int
8549filt_memorystatusprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
8550{
8551#pragma unused(data)
8552 int res;
8553
8554 memorystatus_klist_lock();
8555 res = (kn->kn_fflags != 0);
8556 if (res) {
8557 *kev = kn->kn_kevent;
8558 kn->kn_flags |= EV_CLEAR; /* automatic */
8559 kn->kn_fflags = 0;
8560 kn->kn_data = 0;
8561 }
8562 memorystatus_klist_unlock();
8563
8564 return res;
8565}
8566
39236c6e
A
8567static void
8568memorystatus_klist_lock(void) {
8569 lck_mtx_lock(&memorystatus_klist_mutex);
8570}
8571
8572static void
8573memorystatus_klist_unlock(void) {
8574 lck_mtx_unlock(&memorystatus_klist_mutex);
8575}
8576
8577void
8578memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr) {
8579 lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
8580 klist_init(&memorystatus_klist);
8581}
8582
8583int
8584memorystatus_knote_register(struct knote *kn) {
8585 int error = 0;
813fb2f6 8586
39236c6e 8587 memorystatus_klist_lock();
813fb2f6
A
8588
8589 /*
8590 * Support only userspace visible flags.
8591 */
5ba3f43e 8592 if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
813fb2f6 8593
5ba3f43e 8594#if !CONFIG_EMBEDDED
813fb2f6
A
8595 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
8596 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
8597 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
8598 }
8599
8600 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
8601 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
8602 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
8603 }
5ba3f43e 8604#endif /* !CONFIG_EMBEDDED */
39236c6e 8605
3e170ce0 8606 KNOTE_ATTACH(&memorystatus_klist, kn);
39236c6e 8607
39236c6e
A
8608 } else {
8609 error = ENOTSUP;
b0d623f7 8610 }
39236c6e
A
8611
8612 memorystatus_klist_unlock();
8613
8614 return error;
b0d623f7
A
8615}
8616
39236c6e
A
8617void
8618memorystatus_knote_unregister(struct knote *kn __unused) {
8619 memorystatus_klist_lock();
8620 KNOTE_DETACH(&memorystatus_klist, kn);
8621 memorystatus_klist_unlock();
8622}
316670eb 8623
fe8ab488
A
8624
8625#if 0
39236c6e
A
8626#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
8627static boolean_t
8628memorystatus_issue_pressure_kevent(boolean_t pressured) {
8629 memorystatus_klist_lock();
8630 KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
8631 memorystatus_klist_unlock();
8632 return TRUE;
8633}
39236c6e 8634#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
fe8ab488 8635#endif /* 0 */
3e170ce0 8636
3e170ce0
A
8637/* Coalition support */
8638
8639/* sorting info for a particular priority bucket */
8640typedef struct memstat_sort_info {
8641 coalition_t msi_coal;
8642 uint64_t msi_page_count;
8643 pid_t msi_pid;
8644 int msi_ntasks;
8645} memstat_sort_info_t;
8646
8647/*
8648 * qsort from smallest page count to largest page count
8649 *
8650 * return < 0 for a < b
8651 * 0 for a == b
8652 * > 0 for a > b
8653 */
8654static int memstat_asc_cmp(const void *a, const void *b)
8655{
8656 const memstat_sort_info_t *msA = (const memstat_sort_info_t *)a;
8657 const memstat_sort_info_t *msB = (const memstat_sort_info_t *)b;
8658 /* Compare explicitly: truncating the 64-bit difference to an int can flip the sign. */
8659 return (msA->msi_page_count < msB->msi_page_count) ? -1 : ((msA->msi_page_count > msB->msi_page_count) ? 1 : 0);
8660}
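/*
 * Worked example: leader page counts { 9, 3, 5 } sort ascending to
 * { 3, 5, 9 }. The 3-page coalition is processed first, so after all the
 * head insertions below it sits nearest the tail and jetsams last; the
 * 9-page coalition ends up nearest the head and jetsams first.
 */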
8661
8662/*
8663 * Return the number of pids rearranged during this sort.
8664 */
8665static int
8666memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order)
8667{
8668#define MAX_SORT_PIDS 80
8669#define MAX_COAL_LEADERS 10
8670
8671 unsigned int b = bucket_index;
8672 int nleaders = 0;
8673 int ntasks = 0;
8674 proc_t p = NULL;
8675 coalition_t coal = COALITION_NULL;
8676 int pids_moved = 0;
8677 int total_pids_moved = 0;
8678 int i;
8679
8680 /*
8681 * The system is typically under memory pressure when in this
8682 * path, hence, we want to avoid dynamic memory allocation.
8683 */
8684 memstat_sort_info_t leaders[MAX_COAL_LEADERS];
8685 pid_t pid_list[MAX_SORT_PIDS];
8686
8687 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
8688 return(0);
8689 }
8690
8691 /*
8692 * Clear the array that holds coalition leader information
8693 */
8694 for (i=0; i < MAX_COAL_LEADERS; i++) {
8695 leaders[i].msi_coal = COALITION_NULL;
8696 leaders[i].msi_page_count = 0; /* will hold total coalition page count */
8697 leaders[i].msi_pid = 0; /* will hold coalition leader pid */
8698 leaders[i].msi_ntasks = 0; /* will hold the number of tasks in a coalition */
8699 }
8700
8701 p = memorystatus_get_first_proc_locked(&b, FALSE);
8702 while (p) {
8703 if (coalition_is_leader(p->task, COALITION_TYPE_JETSAM, &coal)) {
8704 if (nleaders < MAX_COAL_LEADERS) {
8705 int coal_ntasks = 0;
8706 uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks);
8707 leaders[nleaders].msi_coal = coal;
8708 leaders[nleaders].msi_page_count = coal_page_count;
8709 leaders[nleaders].msi_pid = p->p_pid; /* the coalition leader */
8710 leaders[nleaders].msi_ntasks = coal_ntasks;
8711 nleaders++;
8712 } else {
8713 /*
8714 * We've hit MAX_COAL_LEADERS meaning we can handle no more coalitions.
8715 * Abandoned coalitions will linger at the tail of the priority band
8716 * when this sort session ends.
8717 * TODO: should this be an assert?
8718 */
8719 printf("%s: WARNING: more than %d leaders in priority band [%d]\n",
8720 __FUNCTION__, MAX_COAL_LEADERS, bucket_index);
8721 break;
8722 }
8723 }
8724 p = memorystatus_get_next_proc_locked(&b, p, FALSE);
8725 }
8726
8727 if (nleaders == 0) {
8728 /* Nothing to sort */
8729 return(0);
8730 }
8731
8732 /*
8733 * Sort the coalition leader array, from smallest coalition page count
8734 * to largest coalition page count. When re-inserted into the priority bucket,
8735 * the smallest coalition is handled first and therefore jetsams last.
8736 */
8737 if (nleaders > 1) {
8738 qsort(leaders, nleaders, sizeof(memstat_sort_info_t), memstat_asc_cmp);
8739 }
8740
8741#if 0
8742 for (i = 0; i < nleaders; i++) {
8743 printf("%s: coal_leader[%d of %d] pid[%d] pages[%llu] ntasks[%d]\n",
8744 __FUNCTION__, i, nleaders, leaders[i].msi_pid, leaders[i].msi_page_count,
8745 leaders[i].msi_ntasks);
8746 }
8747#endif
8748
8749 /*
8750 * During coalition sorting, processes in a priority band are rearranged
8751 * by being re-inserted at the head of the queue. So, when handling a
8752 * list, the first process moved to the head of the queue ultimately
8753 * gets pushed toward the queue tail and hence jetsams last.
8754 *
8755 * So, for example, the coalition leader is expected to jetsam last,
8756 * after its coalition members. Therefore, the coalition leader is
8757 * inserted at the head of the queue first.
8758 *
8759 * After processing a coalition, the jetsam order is as follows:
8760 * undefs(jetsam first), extensions, xpc services, leader(jetsam last)
8761 */
8762
8763 /*
8764 * Coalition members are rearranged in the priority bucket here,
8765 * based on their coalition role.
8766 */
8767 total_pids_moved = 0;
8768 for (i=0; i < nleaders; i++) {
8769
8770 /* a bit of bookkeeping */
8771 pids_moved = 0;
8772
8773 /* Coalition leaders are jetsammed last, so move into place first */
8774 pid_list[0] = leaders[i].msi_pid;
8775 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, 1);
8776
8777 /* xpc services should jetsam after extensions */
8778 ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_XPC,
8779 coal_sort_order, pid_list, MAX_SORT_PIDS);
8780
8781 if (ntasks > 0) {
8782 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
8783 (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
8784 }
8785
8786 /* extensions should jetsam after unmarked processes */
8787 ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_EXT,
8788 coal_sort_order, pid_list, MAX_SORT_PIDS);
8789
8790 if (ntasks > 0) {
8791 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
8792 (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
8793 }
8794
8795 /* undefined coalition members should be the first to jetsam */
8796 ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_UNDEF,
8797 coal_sort_order, pid_list, MAX_SORT_PIDS);
8798
8799 if (ntasks > 0) {
8800 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
8801 (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
8802 }
8803
8804#if 0
8805 if (pids_moved == leaders[i].msi_ntasks) {
8806 /*
8807 * All the pids in the coalition were found in this band.
8808 */
8809 printf("%s: pids_moved[%d] equal total coalition ntasks[%d] \n", __FUNCTION__,
8810 pids_moved, leaders[i].msi_ntasks);
8811 } else if (pids_moved > leaders[i].msi_ntasks) {
8812 /*
8813 * Apparently new coalition members showed up during the sort?
8814 */
8815 printf("%s: pids_moved[%d] were greater than expected coalition ntasks[%d] \n", __FUNCTION__,
8816 pids_moved, leaders[i].msi_ntasks);
8817 } else {
8818 /*
8819 * Apparently not all the pids in the coalition were found in this band?
8820 */
8821 printf("%s: pids_moved[%d] were less than expected coalition ntasks[%d] \n", __FUNCTION__,
8822 pids_moved, leaders[i].msi_ntasks);
8823 }
8824#endif
8825
8826 total_pids_moved += pids_moved;
8827
8828 } /* end for */
8829
8830 return(total_pids_moved);
8831}
8832
8833
8834/*
8835 * Traverse a list of pids, searching for each within the priority band provided.
8836 * If pid is found, move it to the front of the priority band.
8837 * Never searches outside the priority band provided.
8838 *
8839 * Input:
8840 * bucket_index - jetsam priority band.
8841 * pid_list - pointer to a list of pids.
8842 * list_sz - number of pids in the list.
8843 *
8844 * Pid list ordering is important:
8845 * pid_list[n] is expected to jetsam ahead of pid_list[n+1].
8846 * The sort_order is set by the coalition default.
8847 *
8848 * Return:
8849 * the number of pids found and hence moved within the priority band.
8850 */
8851static int
8852memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz)
8853{
8854 memstat_bucket_t *current_bucket;
8855 int i;
8856 int found_pids = 0;
8857
8858 if ((pid_list == NULL) || (list_sz <= 0)) {
8859 return(0);
8860 }
8861
8862 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
8863 return(0);
8864 }
8865
8866 current_bucket = &memstat_bucket[bucket_index];
8867 for (i=0; i < list_sz; i++) {
8868 unsigned int b = bucket_index;
8869 proc_t p = NULL;
8870 proc_t aProc = NULL;
8871 pid_t aPid;
8872 int list_index;
8873
8874 list_index = ((list_sz - 1) - i);
8875 aPid = pid_list[list_index];
8876
8877 /* never search beyond bucket_index provided */
8878 p = memorystatus_get_first_proc_locked(&b, FALSE);
8879 while (p) {
8880 if (p->p_pid == aPid) {
8881 aProc = p;
8882 break;
8883 }
8884 p = memorystatus_get_next_proc_locked(&b, p, FALSE);
8885 }
8886
8887 if (aProc == NULL) {
8888 /* pid not found in this band, just skip it */
8889 continue;
8890 } else {
8891 TAILQ_REMOVE(&current_bucket->list, aProc, p_memstat_list);
8892 TAILQ_INSERT_HEAD(&current_bucket->list, aProc, p_memstat_list);
8893 found_pids++;
8894 }
8895 }
8896 return(found_pids);
8897}
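/*
 * Worked example: one call with pid_list = { A, B, C } walks the list
 * back to front, moving C to the band head, then B ahead of it, then A.
 * The band then reads A, B, C from head to tail, so A jetsams first,
 * honoring the pid_list[n]-before-pid_list[n+1] contract above.
 */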
5ba3f43e
A
8898
8899int
8900memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index)
8901{
8902 int32_t i = JETSAM_PRIORITY_IDLE;
8903 int count = 0;
8904
8905 if (max_bucket_index >= MEMSTAT_BUCKET_COUNT) {
8906 return(-1);
8907 }
8908
8909 while (i <= max_bucket_index) {
8910 count += memstat_bucket[i++].count;
8911 }
8912
8913 return count;
8914}
8915
8916int
8917memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap)
8918{
8919#if !CONFIG_JETSAM
8920 if (!p || (!isApp(p)) || (p->p_memstat_state & P_MEMSTAT_INTERNAL)) {
8921 /*
8922 * Ineligible processes OR system processes e.g. launchd.
8923 */
8924 return -1;
8925 }
8926
8927 /*
8928 * For macOS only:
8929 * We would like to use memorystatus_update() here to move the process
8930 * between the bands. Unfortunately, memorystatus_update() calls
8931 * memorystatus_update_priority_locked(), which treats any band transition
8932 * as an indication to modify ledgers. For that it needs the task lock, and
8933 * since we enter this function with the task lock already held, we would deadlock.
8934 *
8935 * We also can't completely disable ledger updates, because we still
8936 * need them for a subset of processes, i.e. daemons.
8937 * Once all processes on all platforms support memory limits, we can simply call
8938 * memorystatus_update().
8939 *
8940 * memorystatus_update() also has logic to deal with 'aging', which currently
8941 * only applies on CONFIG_JETSAM configs. So, until every platform has
8942 * CONFIG_JETSAM, we'll need to do this explicit band transition.
8943 */
8944
8945 memstat_bucket_t *current_bucket, *new_bucket;
8946 int32_t priority = 0;
8947
8948 proc_list_lock();
8949
8950 if (((p->p_listflag & P_LIST_EXITED) != 0) ||
8951 (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED))) {
8952 /*
8953 * If the process is on its way out OR
8954 * jetsam has already tried and failed to kill this process,
8955 * let's skip the whole jetsam band transition.
8956 */
8957 proc_list_unlock();
8958 return(0);
8959 }
8960
8961 if (is_appnap) {
8962 current_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
8963 new_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
8964 priority = JETSAM_PRIORITY_IDLE;
8965 } else {
8966 if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
8967 /*
8968 * It is possible that someone pulled this process
8969 * out of the IDLE band without updating its app-nap
8970 * parameters.
8971 */
8972 proc_list_unlock();
8973 return (0);
8974 }
8975
8976 current_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
8977 new_bucket = &memstat_bucket[p->p_memstat_requestedpriority];
8978 priority = p->p_memstat_requestedpriority;
8979 }
8980
8981 TAILQ_REMOVE(&current_bucket->list, p, p_memstat_list);
8982 current_bucket->count--;
8983
8984 TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
8985 new_bucket->count++;
8986
8987 /*
8988 * Record idle start or idle delta.
8989 */
8990 if (p->p_memstat_effectivepriority == priority) {
8991 /*
8992 * This process is not transitioning between
8993 * jetsam priority buckets. Do nothing.
8994 */
8995 } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
8996 uint64_t now;
8997 /*
8998 * Transitioning out of the idle priority bucket.
8999 * Record idle delta.
9000 */
9001 assert(p->p_memstat_idle_start != 0);
9002 now = mach_absolute_time();
9003 if (now > p->p_memstat_idle_start) {
9004 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
9005 }
9006 } else if (priority == JETSAM_PRIORITY_IDLE) {
9007 /*
9008 * Transitioning into the idle priority bucket.
9009 * Record idle start.
9010 */
9011 p->p_memstat_idle_start = mach_absolute_time();
9012 }
9013
9014 p->p_memstat_effectivepriority = priority;
9015
9016 proc_list_unlock();
9017
9018 return (0);
9019
9020#else /* !CONFIG_JETSAM */
9021 #pragma unused(p)
9022 #pragma unused(is_appnap)
9023 return -1;
9024#endif /* !CONFIG_JETSAM */
9025}