/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 */

#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/assert.h>
#include <kern/debug.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/host.h>
#include <libkern/libkern.h>
#include <mach/coalition.h>
#include <mach/mach_time.h>
#include <mach/task.h>
#include <mach/host_priv.h>
#include <mach/mach_host.h>
#include <pexpert/pexpert.h>
#include <sys/coalition.h>
#include <sys/kern_event.h>
#include <sys/proc.h>
#include <sys/proc_info.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/wait.h>
#include <sys/tree.h>
#include <sys/priv.h>
#include <vm/vm_pageout.h>
#include <vm/vm_protos.h>

#if CONFIG_FREEZE
#include <vm/vm_map.h>
#endif /* CONFIG_FREEZE */

#include <sys/kern_memorystatus.h>

#if CONFIG_JETSAM
/* For logging clarity */
static const char *jetsam_kill_cause_name[] = {
	"",
	"jettisoned",		/* kMemorystatusKilled */
	"highwater",		/* kMemorystatusKilledHiwat */
	"vnode-limit",		/* kMemorystatusKilledVnodes */
	"vm-pageshortage",	/* kMemorystatusKilledVMPageShortage */
	"vm-thrashing",		/* kMemorystatusKilledVMThrashing */
	"fc-thrashing",		/* kMemorystatusKilledFCThrashing */
	"per-process-limit",	/* kMemorystatusKilledPerProcessLimit */
	"diagnostic",		/* kMemorystatusKilledDiagnostic */
	"idle-exit",		/* kMemorystatusKilledIdleExit */
};
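
/*
 * Illustrative, bounds-checked lookup (a sketch only; no such helper exists
 * in this file -- callers index jetsam_kill_cause_name directly):
 *
 *	static const char *
 *	jetsam_kill_cause_to_name(unsigned cause)
 *	{
 *		size_t n = sizeof(jetsam_kill_cause_name) / sizeof(jetsam_kill_cause_name[0]);
 *		return (cause < n) ? jetsam_kill_cause_name[cause] : "unknown";
 *	}
 */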

/* Does cause indicate vm or fc thrashing? */
static boolean_t
is_thrashing(unsigned cause)
{
	switch (cause) {
	case kMemorystatusKilledVMThrashing:
	case kMemorystatusKilledFCThrashing:
		return TRUE;
	default:
		return FALSE;
	}
}

/* Callback into vm_compressor.c to signal that thrashing has been mitigated. */
extern void vm_thrashing_jetsam_done(void);
#endif /* CONFIG_JETSAM */

/*
 * These printf()s are very verbose; enable with
 * MEMORYSTATUS_DEBUG_LOG.
 */
#if MEMORYSTATUS_DEBUG_LOG
#define MEMORYSTATUS_DEBUG(cond, format, ...)		\
do {							\
	if (cond) { printf(format, ##__VA_ARGS__); }	\
} while(0)
#else
#define MEMORYSTATUS_DEBUG(cond, format, ...)
#endif
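
/*
 * Example (illustrative): the statement below logs only on kernels built
 * with MEMORYSTATUS_DEBUG_LOG defined; otherwise the macro expands to nothing.
 *
 *	MEMORYSTATUS_DEBUG(1, "pid %d moved to band %d\n", p->p_pid, priority);
 */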

/*
 * Active / Inactive limit support
 * proc list must be locked
 *
 * The SET_*** macros are used to initialize a limit
 * for the first time.
 *
 * The CACHE_*** macros are used to cache the limit that will
 * soon be in effect down in the ledgers.
 */

#define SET_ACTIVE_LIMITS_LOCKED(p, limit, is_fatal)				\
MACRO_BEGIN									\
	(p)->p_memstat_memlimit_active = (limit);				\
	(p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_EXC_TRIGGERED;	\
	if (is_fatal) {								\
		(p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;	\
	} else {								\
		(p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;	\
	}									\
MACRO_END

#define SET_INACTIVE_LIMITS_LOCKED(p, limit, is_fatal)				\
MACRO_BEGIN									\
	(p)->p_memstat_memlimit_inactive = (limit);				\
	(p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_EXC_TRIGGERED;	\
	if (is_fatal) {								\
		(p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;	\
	} else {								\
		(p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;	\
	}									\
MACRO_END

#define CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception)			\
MACRO_BEGIN									\
	(p)->p_memstat_memlimit = (p)->p_memstat_memlimit_active;		\
	if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {		\
		(p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;		\
	} else {								\
		(p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;		\
	}									\
	if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_EXC_TRIGGERED) {	\
		trigger_exception = FALSE;					\
	} else {								\
		trigger_exception = TRUE;					\
	}									\
MACRO_END

#define CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception)			\
MACRO_BEGIN									\
	(p)->p_memstat_memlimit = (p)->p_memstat_memlimit_inactive;		\
	if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {		\
		(p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;		\
	} else {								\
		(p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;		\
	}									\
	if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_EXC_TRIGGERED) {	\
		trigger_exception = FALSE;					\
	} else {								\
		trigger_exception = TRUE;					\
	}									\
MACRO_END
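
/*
 * Typical sequence (illustrative sketch, mirroring the pattern used by the
 * handlers later in this file): initialize both limit slots, cache whichever
 * one currently applies, then enforce it by writing to the ledger.
 *
 *	boolean_t trigger_exception;
 *
 *	SET_ACTIVE_LIMITS_LOCKED(p, active_limit_mb, active_is_fatal);
 *	SET_INACTIVE_LIMITS_LOCKED(p, inactive_limit_mb, inactive_is_fatal);
 *	if (proc_jetsam_state_is_active_locked(p) == TRUE) {
 *		CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);
 *	} else {
 *		CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
 *	}
 *	task_set_phys_footprint_limit_internal(p->task,
 *	    (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
 *	    NULL, trigger_exception);
 */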


/* General tunables */

unsigned long delta_percentage = 5;
unsigned long critical_threshold_percentage = 5;
unsigned long idle_offset_percentage = 5;
unsigned long pressure_threshold_percentage = 15;
unsigned long freeze_threshold_percentage = 50;

/* General memorystatus stuff */

struct klist memorystatus_klist;
static lck_mtx_t memorystatus_klist_mutex;

static void memorystatus_klist_lock(void);
static void memorystatus_klist_unlock(void);

static uint64_t memorystatus_idle_delay_time = 0;

/*
 * Memorystatus kevents
 */

static int filt_memorystatusattach(struct knote *kn);
static void filt_memorystatusdetach(struct knote *kn);
static int filt_memorystatus(struct knote *kn, long hint);

struct filterops memorystatus_filtops = {
	.f_attach = filt_memorystatusattach,
	.f_detach = filt_memorystatusdetach,
	.f_event = filt_memorystatus,
};

enum {
	kMemorystatusNoPressure = 0x1,
	kMemorystatusPressure = 0x2,
	kMemorystatusLowSwap = 0x4
};

/* Idle guard handling */

static int32_t memorystatus_scheduled_idle_demotions = 0;

static thread_call_t memorystatus_idle_demotion_call;

static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state);
static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state);
static void memorystatus_reschedule_idle_demotion_locked(void);

static void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert);

boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
void memorystatus_send_low_swap_note(void);

int memorystatus_wakeup = 0;

unsigned int memorystatus_level = 0;
unsigned int memorystatus_early_boot_level = 0;

static int memorystatus_list_count = 0;

#define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1)

typedef struct memstat_bucket {
	TAILQ_HEAD(, proc) list;
	int count;
} memstat_bucket_t;

memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];

uint64_t memstat_idle_demotion_deadline = 0;

static unsigned int memorystatus_dirty_count = 0;

#if CONFIG_JETSAM
SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED, &max_task_footprint_mb, 0, "");
#endif // CONFIG_JETSAM


int
memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
{
	user_addr_t level = 0;

	level = args->level;

	if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
		return EFAULT;
	}

	return 0;
}

static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search);
static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search);

static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);

/* Jetsam */

#if CONFIG_JETSAM

static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);

static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry);

static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static boolean_t proc_jetsam_state_is_active_locked(proc_t);

int proc_get_memstat_priority(proc_t, boolean_t);

/* Kill processes exceeding their limit either under memory pressure (1), or as soon as possible (0) */
#define LEGACY_HIWATER 1

static boolean_t memorystatus_idle_snapshot = 0;

static int memorystatus_highwater_enabled = 1;	/* Update the cached memlimit data. This should be removed. */

unsigned int memorystatus_delta = 0;

static unsigned int memorystatus_available_pages_critical_base = 0;
//static unsigned int memorystatus_last_foreground_pressure_pages = (unsigned int)-1;
static unsigned int memorystatus_available_pages_critical_idle_offset = 0;

/* Jetsam Loop Detection */
static boolean_t memorystatus_jld_enabled = TRUE;		/* Enables jetsam loop detection on all devices */
static uint32_t memorystatus_jld_eval_period_msecs = 0;		/* Init pass sets this based on device memory size */
static int      memorystatus_jld_eval_aggressive_count = 3;	/* Raise the priority max after 'n' aggressive loops */
static int      memorystatus_jld_eval_aggressive_priority_band_max = 15;  /* Kill aggressively up through this band */

#if DEVELOPMENT || DEBUG
/*
 * Jetsam Loop Detection tunables.
 */

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_period_msecs, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_period_msecs, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_count, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_count, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_priority_band_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_priority_band_max, 0, "");
#endif /* DEVELOPMENT || DEBUG */

#if DEVELOPMENT || DEBUG
static unsigned int memorystatus_jetsam_panic_debug = 0;

static unsigned int memorystatus_jetsam_policy = kPolicyDefault;
static unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0;
static unsigned int memorystatus_debug_dump_this_bucket = 0;
#endif

static unsigned int memorystatus_thread_wasted_wakeup = 0;

static uint32_t kill_under_pressure_cause = 0;

/*
 * default jetsam snapshot support
 */
static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries
static unsigned int memorystatus_jetsam_snapshot_count = 0;
static unsigned int memorystatus_jetsam_snapshot_max = 0;
static uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
static uint64_t memorystatus_jetsam_snapshot_timeout = 0;
#define JETSAM_SNAPSHOT_TIMEOUT_SECS 30

/*
 * snapshot support for memstats collected at boot.
 */
static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot;

static void memorystatus_clear_errors(void);
static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages);
static uint32_t memorystatus_build_state(proc_t p);
static void memorystatus_update_levels_locked(boolean_t critical_only);
//static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);

static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause);
static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, int32_t *priority, uint32_t *errors);
static boolean_t memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors);
#if LEGACY_HIWATER
static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors);
#endif

static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);
static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause);

/* Priority Band Sorting Routines */
static int  memorystatus_sort_bucket(unsigned int bucket_index, int sort_order);
static int  memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order);
static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index);
static int  memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz);

/* qsort routines */
typedef int (*cmpfunc_t)(const void *a, const void *b);
extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
static int memstat_asc_cmp(const void *a, const void *b);

#endif /* CONFIG_JETSAM */

/* VM pressure */

extern unsigned int vm_page_free_count;
extern unsigned int vm_page_active_count;
extern unsigned int vm_page_inactive_count;
extern unsigned int vm_page_throttled_count;
extern unsigned int vm_page_purgeable_count;
extern unsigned int vm_page_wire_count;

#if VM_PRESSURE_EVENTS

#include "vm_pressure.h"

extern boolean_t memorystatus_warn_process(pid_t pid, boolean_t critical);

vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;

#if CONFIG_MEMORYSTATUS
unsigned int memorystatus_available_pages = (unsigned int)-1;
unsigned int memorystatus_available_pages_pressure = 0;
unsigned int memorystatus_available_pages_critical = 0;
unsigned int memorystatus_frozen_count = 0;
unsigned int memorystatus_suspended_count = 0;

/*
 * We use this flag to signal if we have any HWM offenders
 * on the system. This way we can reduce the number of wakeups
 * of the memorystatus_thread when the system is between the
 * "pressure" and "critical" thresholds.
 *
 * The (re-)setting of this variable is done without any locks
 * or synchronization simply because it is not possible (currently)
 * to keep track of HWM offenders that drop down below their memory
 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
 * by allowing the unguarded modification of this variable.
 */
boolean_t memorystatus_hwm_candidates = 0;

static int memorystatus_send_note(int event_code, void *data, size_t data_length);
#endif /* CONFIG_MEMORYSTATUS */

#endif /* VM_PRESSURE_EVENTS */

/* Freeze */

#if CONFIG_FREEZE

boolean_t memorystatus_freeze_enabled = FALSE;
int memorystatus_freeze_wakeup = 0;

lck_grp_attr_t *freezer_lck_grp_attr;
lck_grp_t *freezer_lck_grp;
static lck_mtx_t freezer_mutex;

static inline boolean_t memorystatus_can_freeze_processes(void);
static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low);

static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused);

/* Thresholds */
static unsigned int memorystatus_freeze_threshold = 0;

static unsigned int memorystatus_freeze_pages_min = 0;
static unsigned int memorystatus_freeze_pages_max = 0;

static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;

static unsigned int memorystatus_freeze_daily_mb_max = FREEZE_DAILY_MB_MAX_DEFAULT;

/* Stats */
static uint64_t memorystatus_freeze_count = 0;
static uint64_t memorystatus_freeze_pageouts = 0;

/* Throttling */
static throttle_interval_t throttle_intervals[] = {
	{      60, 8, 0, 0, { 0, 0 }, FALSE },	/* 1 hour intermediate interval, 8x burst */
	{ 24 * 60, 1, 0, 0, { 0, 0 }, FALSE },	/* 24 hour long interval, no burst */
};

static uint64_t memorystatus_freeze_throttle_count = 0;

static unsigned int memorystatus_suspended_footprint_total = 0;

extern uint64_t vm_swap_get_free_space(void);

static boolean_t memorystatus_freeze_update_throttle(void);

#endif /* CONFIG_FREEZE */

/* Debug */

extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);

#if DEVELOPMENT || DEBUG

#if CONFIG_JETSAM

static void
memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)
{
	proc_t p = NULL;
	uint32_t pages = 0;
	uint32_t pages_in_mb = 0;
	unsigned int b = bucket_index;
	boolean_t traverse_all_buckets = FALSE;

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		traverse_all_buckets = TRUE;
		b = 0;
	} else {
		traverse_all_buckets = FALSE;
		b = bucket_index;
	}

	/*
	 * Missing from this dump is the value actually
	 * stored in the ledger... also, format could be better.
	 */
	printf("memorystatus_debug_dump ***START***\n");
	printf("bucket [pid] [pages/pages-mb] state [EP / RP] dirty deadline [C-limit / A-limit / IA-limit] name\n");
	p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
	while (p) {
		memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
		pages_in_mb = (pages * 4096) / 1024 / 1024;
		printf("%d [%d] [%d/%dMB] 0x%x [%d / %d] 0x%x %lld [%d%s / %d%s / %d%s] %s\n",
		    b, p->p_pid, pages, pages_in_mb,
		    p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_dirty, p->p_memstat_idledeadline,
		    p->p_memstat_memlimit,
		    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
		    p->p_memstat_memlimit_active,
		    (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"),
		    p->p_memstat_memlimit_inactive,
		    (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"),
		    (p->p_comm ? p->p_comm : "unknown"));
		p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets);
	}
	printf("memorystatus_debug_dump ***END***\n");
}

static int
sysctl_memorystatus_debug_dump_bucket SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	int bucket_index = 0;
	int error;
	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}
	error = SYSCTL_IN(req, &bucket_index, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}
	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		/*
		 * All jetsam buckets will be dumped.
		 */
	} else {
		/*
		 * Only a single bucket will be dumped.
		 */
	}

	proc_list_lock();
	memorystatus_debug_dump_bucket_locked(bucket_index);
	proc_list_unlock();
	memorystatus_debug_dump_this_bucket = bucket_index;
	return (error);
}

/*
 * Debug aid to look at jetsam buckets and proc jetsam fields.
 * Use this sysctl to act on a particular jetsam bucket.
 * Writing the sysctl triggers the dump.
 * Usage: sysctl kern.memorystatus_debug_dump_this_bucket=<bucket_index>
 */

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_debug_dump_this_bucket, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_debug_dump_this_bucket, 0, sysctl_memorystatus_debug_dump_bucket, "I", "");


/* Debug aid to help determine the limit */

static int
sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	proc_t p;
	unsigned int b = 0;
	int error, enable = 0;

	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}

	error = SYSCTL_IN(req, &enable, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}

	if (!(enable == 0 || enable == 1)) {
		return EINVAL;
	}

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (p) {
		boolean_t trigger_exception;

		if (enable) {
			/*
			 * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore.
			 * Background limits are described via the inactive limit slots.
			 */

			if (proc_jetsam_state_is_active_locked(p) == TRUE) {
				CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);
			} else {
				CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
			}

		} else {
			/*
			 * Disabling limits does not touch the stored variants.
			 * Set the cached limit fields to system_wide defaults.
			 */
			p->p_memstat_memlimit = -1;
			p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
			trigger_exception = TRUE;
		}

		/*
		 * Enforce the cached limit by writing to the ledger.
		 */
		task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, trigger_exception);

		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
	}

	memorystatus_highwater_enabled = enable;

	proc_list_unlock();

	return 0;
}

SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");

/* Diagnostic code */

enum {
	kJetsamDiagnosticModeNone =              0,
	kJetsamDiagnosticModeAll  =              1,
	kJetsamDiagnosticModeStopAtFirstActive = 2,
	kJetsamDiagnosticModeCount
} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone;

static int jetsam_diagnostic_suspended_one_active_proc = 0;

static int
sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	const char *diagnosticStrings[] = {
		"jetsam: diagnostic mode: resetting critical level.",
		"jetsam: diagnostic mode: will examine all processes",
		"jetsam: diagnostic mode: will stop at first active process"
	};

	int error, val = jetsam_diagnostic_mode;
	boolean_t changed = FALSE;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);
	if ((val < 0) || (val >= kJetsamDiagnosticModeCount)) {
		printf("jetsam: diagnostic mode: invalid value - %d\n", val);
		return EINVAL;
	}

	proc_list_lock();

	if ((unsigned int) val != jetsam_diagnostic_mode) {
		jetsam_diagnostic_mode = val;

		memorystatus_jetsam_policy &= ~kPolicyDiagnoseActive;

		switch (jetsam_diagnostic_mode) {
		case kJetsamDiagnosticModeNone:
			/* Already cleared */
			break;
		case kJetsamDiagnosticModeAll:
			memorystatus_jetsam_policy |= kPolicyDiagnoseAll;
			break;
		case kJetsamDiagnosticModeStopAtFirstActive:
			memorystatus_jetsam_policy |= kPolicyDiagnoseFirst;
			break;
		default:
			/* Already validated */
			break;
		}

		memorystatus_update_levels_locked(FALSE);
		changed = TRUE;
	}

	proc_list_unlock();

	if (changed) {
		printf("%s\n", diagnosticStrings[val]);
	}

	return (0);
}

SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED|CTLFLAG_ANYBODY,
    &jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jetsam_policy_offset_pages_diagnostic, 0, "");

#if VM_PRESSURE_EVENTS

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");


/*
 * This routine is used for targeted notifications
 * regardless of system memory pressure.
 * "memnote" is the current user.
 */

static int
sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error = 0, pid = 0;
	int ret = 0;
	struct knote *kn = NULL;
	boolean_t found_knote = FALSE;

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	/*
	 * We inspect 3 lists here for targeted notifications:
	 * - memorystatus_klist
	 * - vm_pressure_klist
	 * - vm_pressure_dormant_klist
	 *
	 * The vm_pressure_* lists are tied to the old VM_PRESSURE
	 * notification mechanism. We intend to stop using that
	 * mechanism and, in turn, get rid of the 2 lists and
	 * vm_dispatch_pressure_note_to_pid() too.
	 */

	memorystatus_klist_lock();

	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		proc_t knote_proc = kn->kn_kq->kq_p;
		pid_t knote_pid = knote_proc->p_pid;

		if (knote_pid == pid) {
			/*
			 * Forcibly send this pid a "warning" memory pressure notification.
			 */
			kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
			found_knote = TRUE;
		}
	}

	if (found_knote) {
		KNOTE(&memorystatus_klist, 0);
		ret = 0;
	} else {
		ret = vm_dispatch_pressure_note_to_pid(pid, FALSE);
	}

	memorystatus_klist_unlock();

	return ret;
}
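
/*
 * Usage (illustrative): sysctl kern.memorystatus_vm_pressure_send=<pid>
 * sends the given pid a "warning" pressure notification regardless of
 * actual system memory pressure.
 */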

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_vm_pressure_send, "I", "");

#endif /* VM_PRESSURE_EVENTS */

#endif /* CONFIG_JETSAM */

#if CONFIG_FREEZE

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, "");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");

SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, "");
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");

boolean_t memorystatus_freeze_throttle_enabled = TRUE;
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");

/*
 * Manual trigger of freeze and thaw for dev / debug kernels only.
 */
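/*
 * Usage (illustrative, DEVELOPMENT/DEBUG kernels only):
 *	sysctl kern.memorystatus_freeze=<pid>
 *	sysctl kern.memorystatus_thaw=<pid>
 * As a special case, writing pid 2 to the freeze sysctl triggers
 * vm_pageout_anonymous_pages() instead of freezing a process.
 */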
static int
sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error, pid = 0;
	proc_t p;

	if (memorystatus_freeze_enabled == FALSE) {
		return ENOTSUP;
	}

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	if (pid == 2) {
		vm_pageout_anonymous_pages();

		return 0;
	}

	lck_mtx_lock(&freezer_mutex);

	p = proc_find(pid);
	if (p != NULL) {
		uint32_t purgeable, wired, clean, dirty;
		boolean_t shared;
		uint32_t max_pages = 0;

		if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {

			unsigned int avail_swap_space = 0; /* in pages. */

			if (DEFAULT_FREEZER_IS_ACTIVE) {
				/*
				 * Freezer backed by default pager and swap file(s).
				 */
				avail_swap_space = default_pager_swap_pages_free();
			} else {
				/*
				 * Freezer backed by the compressor and swap file(s),
				 * which will hold the compressed data.
				 */
				avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
			}

			max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);

		} else {
			/*
			 * We only have the compressor without any swap.
			 */
			max_pages = UINT32_MAX - 1;
		}

		error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
		proc_rele(p);

		if (error)
			error = EIO;

		lck_mtx_unlock(&freezer_mutex);
		return error;
	}

	lck_mtx_unlock(&freezer_mutex);
	return EINVAL;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_freeze, "I", "");

static int
sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error, pid = 0;
	proc_t p;

	if (memorystatus_freeze_enabled == FALSE) {
		return ENOTSUP;
	}

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	p = proc_find(pid);
	if (p != NULL) {
		error = task_thaw(p->task);
		proc_rele(p);

		if (error)
			error = EIO;
		return error;
	}

	return EINVAL;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_available_pages_thaw, "I", "");

#endif /* CONFIG_FREEZE */

#endif /* DEVELOPMENT || DEBUG */

extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
						  void *parameter,
						  integer_t priority,
						  thread_t *new_thread);

#if CONFIG_JETSAM
/*
 * Picks the sorting routine for a given jetsam priority band.
 *
 * Input:
 *	bucket_index - jetsam priority band to be sorted.
 *	sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
 *		Currently sort_order is only meaningful when handling
 *		coalitions.
 *
 * Return:
 *	0     on success
 *	non-0 on failure
 */
static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order)
{
	int coal_sort_order;

	/*
	 * Verify the jetsam priority
	 */
	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return (EINVAL);
	}

#if DEVELOPMENT || DEBUG
	if (sort_order == JETSAM_SORT_DEFAULT) {
		coal_sort_order = COALITION_SORT_DEFAULT;
	} else {
		coal_sort_order = sort_order;	/* only used for testing scenarios */
	}
#else
	/* Verify default */
	if (sort_order == JETSAM_SORT_DEFAULT) {
		coal_sort_order = COALITION_SORT_DEFAULT;
	} else {
		return (EINVAL);
	}
#endif

	proc_list_lock();
	switch (bucket_index) {
	case JETSAM_PRIORITY_FOREGROUND:
		if (memorystatus_sort_by_largest_coalition_locked(bucket_index, coal_sort_order) == 0) {
			/*
			 * Fall back to per-process sorting when zero coalitions are found.
			 */
			memorystatus_sort_by_largest_process_locked(bucket_index);
		}
		break;
	default:
		memorystatus_sort_by_largest_process_locked(bucket_index);
		break;
	}
	proc_list_unlock();

	return (0);
}
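
/*
 * Illustrative invocation (a sketch): sort the foreground band using the
 * default coalition ordering.
 *
 *	(void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
 */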

/*
 * Sort processes by size for a single jetsam bucket.
 *
 * This is an in-place selection sort: each pass finds the largest
 * remaining process and moves it toward the front of the bucket,
 * yielding a list ordered by descending footprint.
 */

static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
{
	proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
	proc_t next_p = NULL, prev_max_proc = NULL;
	uint32_t pages = 0, max_pages = 0;
	memstat_bucket_t *current_bucket;

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return;
	}

	current_bucket = &memstat_bucket[bucket_index];

	p = TAILQ_FIRST(&current_bucket->list);

	while (p) {
		memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
		max_pages = pages;
		max_proc = p;
		prev_max_proc = p;

		while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
			/* traversing list until we find next largest process */
			p = next_p;
			memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
			if (pages > max_pages) {
				max_pages = pages;
				max_proc = p;
			}
		}

		if (prev_max_proc != max_proc) {
			/* found a larger process, place it in the list */
			TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
			if (insert_after_proc == NULL) {
				TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
			} else {
				TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
			}
			prev_max_proc = max_proc;
		}

		insert_after_proc = max_proc;

		p = TAILQ_NEXT(max_proc, p_memstat_list);
	}
}

#endif /* CONFIG_JETSAM */

static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search) {
	memstat_bucket_t *current_bucket;
	proc_t next_p;

	if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
		return NULL;
	}

	current_bucket = &memstat_bucket[*bucket_index];
	next_p = TAILQ_FIRST(&current_bucket->list);
	if (!next_p && search) {
		while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
			current_bucket = &memstat_bucket[*bucket_index];
			next_p = TAILQ_FIRST(&current_bucket->list);
		}
	}

	return next_p;
}

static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search) {
	memstat_bucket_t *current_bucket;
	proc_t next_p;

	if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
		return NULL;
	}

	next_p = TAILQ_NEXT(p, p_memstat_list);
	while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
		current_bucket = &memstat_bucket[*bucket_index];
		next_p = TAILQ_FIRST(&current_bucket->list);
	}

	return next_p;
}
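
/*
 * Typical traversal pattern (illustrative sketch; my_visit() is a
 * hypothetical consumer, not a routine in this file). With search == TRUE
 * the walk spills over from one priority bucket into the next:
 *
 *	unsigned int b = 0;
 *	proc_t p;
 *
 *	proc_list_lock();
 *	p = memorystatus_get_first_proc_locked(&b, TRUE);
 *	while (p) {
 *		my_visit(p);
 *		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
 *	}
 *	proc_list_unlock();
 */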

__private_extern__ void
memorystatus_init(void)
{
	thread_t thread = THREAD_NULL;
	kern_return_t result;
	int i;

#if CONFIG_FREEZE
	memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
	memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
#endif

	nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_idle_delay_time);

	/* Init buckets */
	for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
		TAILQ_INIT(&memstat_bucket[i].list);
		memstat_bucket[i].count = 0;
	}

	memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);

	/* Apply overrides */
	PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
	assert(delta_percentage < 100);
	PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
	assert(critical_threshold_percentage < 100);
	PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
	assert(idle_offset_percentage < 100);
	PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
	assert(pressure_threshold_percentage < 100);
	PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
	assert(freeze_threshold_percentage < 100);

#if CONFIG_JETSAM
	/* device tree can request to take snapshots for idle-exit kills by default */
	PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));

	memorystatus_delta = delta_percentage * atop_64(max_mem) / 100;
	memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100;
	memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta;

	memorystatus_jetsam_snapshot_max = maxproc;
	memorystatus_jetsam_snapshot =
		(memorystatus_jetsam_snapshot_t*)kalloc(sizeof(memorystatus_jetsam_snapshot_t) +
		sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
	if (!memorystatus_jetsam_snapshot) {
		panic("Could not allocate memorystatus_jetsam_snapshot");
	}

	nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);

	memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));

	/* No contention at this point */
	memorystatus_update_levels_locked(FALSE);

	/* Jetsam Loop Detection */
	if (max_mem <= (512 * 1024 * 1024)) {
		/* 512 MB devices */
		memorystatus_jld_eval_period_msecs = 8000;	/* 8000 msecs == 8 second window */
	} else {
		/* 1GB and larger devices */
		memorystatus_jld_eval_period_msecs = 6000;	/* 6000 msecs == 6 second window */
	}
#endif /* CONFIG_JETSAM */

#if CONFIG_FREEZE
	memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta;
#endif /* CONFIG_FREEZE */

	result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
	if (result == KERN_SUCCESS) {
		thread_deallocate(thread);
	} else {
		panic("Could not create memorystatus_thread");
	}
}

/* Centralised for the purposes of allowing panic-on-jetsam */
extern void
vm_wake_compactor_swapper(void);

/*
 * The jetsam no-frills kill call
 *	Return: 0 on success
 *		error code on failure (EINVAL...)
 */
static int
jetsam_do_kill(proc_t p, int jetsam_flags) {
	int error = 0;
	error = exit1_internal(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags);
	return (error);
}

/*
 * Wrapper for processes exiting with memorystatus details
 */
static boolean_t
memorystatus_do_kill(proc_t p, uint32_t cause) {

	int error = 0;
	__unused pid_t victim_pid = p->p_pid;

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START,
	    victim_pid, cause, vm_page_free_count, 0, 0);

#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
	if (memorystatus_jetsam_panic_debug & (1 << cause)) {
		panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause);
	}
#else
#pragma unused(cause)
#endif
	int jetsam_flags = P_LTERM_JETSAM;
	switch (cause) {
	case kMemorystatusKilledHiwat:			jetsam_flags |= P_JETSAM_HIWAT; break;
	case kMemorystatusKilledVnodes:			jetsam_flags |= P_JETSAM_VNODE; break;
	case kMemorystatusKilledVMPageShortage:		jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
	case kMemorystatusKilledVMThrashing:		jetsam_flags |= P_JETSAM_VMTHRASHING; break;
	case kMemorystatusKilledFCThrashing:		jetsam_flags |= P_JETSAM_FCTHRASHING; break;
	case kMemorystatusKilledPerProcessLimit:	jetsam_flags |= P_JETSAM_PID; break;
	case kMemorystatusKilledIdleExit:		jetsam_flags |= P_JETSAM_IDLEEXIT; break;
	}
	error = jetsam_do_kill(p, jetsam_flags);

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END,
	    victim_pid, cause, vm_page_free_count, error, 0);

	vm_wake_compactor_swapper();

	return (error == 0);
}

/*
 * Node manipulation
 */

static void
memorystatus_check_levels_locked(void) {
#if CONFIG_JETSAM
	/* Update levels */
	memorystatus_update_levels_locked(TRUE);
#endif
}

static void
memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
{
	proc_t p;
	uint64_t current_time;
	memstat_bucket_t *demotion_bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);

	current_time = mach_absolute_time();

	proc_list_lock();

	demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
	p = TAILQ_FIRST(&demotion_bucket->list);

	while (p) {
		MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid);

		assert(p->p_memstat_idledeadline);
		assert(p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS);
		assert((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED);

		if (current_time >= p->p_memstat_idledeadline) {
#if DEBUG || DEVELOPMENT
			if (!(p->p_memstat_dirty & P_DIRTY_MARKED)) {
				printf("memorystatus_perform_idle_demotion: moving process %d [%s] to idle band, but never dirtied (0x%x)!\n",
				    p->p_pid, (p->p_comm ? p->p_comm : "(unknown)"), p->p_memstat_dirty);
			}
#endif
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false);

			// The prior process has moved out of the demotion bucket, so grab the new head and continue
			p = TAILQ_FIRST(&demotion_bucket->list);
			continue;
		}

		// No further candidates
		break;
	}

	memorystatus_reschedule_idle_demotion_locked();

	proc_list_unlock();

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}

static void
memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
{
	boolean_t present_in_deferred_bucket = FALSE;

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		present_in_deferred_bucket = TRUE;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n",
	    p->p_pid, p->p_memstat_dirty, set_state, memorystatus_scheduled_idle_demotions);

	assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED);

	if (set_state) {
		assert(p->p_memstat_idledeadline == 0);
		p->p_memstat_dirty |= P_DIRTY_DEFER_IN_PROGRESS;
		p->p_memstat_idledeadline = mach_absolute_time() + memorystatus_idle_delay_time;
	}

	assert(p->p_memstat_idledeadline);

	if (present_in_deferred_bucket == FALSE) {
		memorystatus_scheduled_idle_demotions++;
	}
}

static void
memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
{
	boolean_t present_in_deferred_bucket = FALSE;

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		present_in_deferred_bucket = TRUE;
		assert(p->p_memstat_idledeadline);
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for pid %d (clear_state %d, demotions %d).\n",
	    p->p_pid, clear_state, memorystatus_scheduled_idle_demotions);


	if (clear_state) {
		p->p_memstat_idledeadline = 0;
		p->p_memstat_dirty &= ~P_DIRTY_DEFER_IN_PROGRESS;
	}

	if (present_in_deferred_bucket == TRUE) {
		memorystatus_scheduled_idle_demotions--;
	}

	assert(memorystatus_scheduled_idle_demotions >= 0);
}

static void
memorystatus_reschedule_idle_demotion_locked(void) {
	if (0 == memorystatus_scheduled_idle_demotions) {
		if (memstat_idle_demotion_deadline) {
			/* Transitioned 1->0, so cancel next call */
			thread_call_cancel(memorystatus_idle_demotion_call);
			memstat_idle_demotion_deadline = 0;
		}
	} else {
		memstat_bucket_t *demotion_bucket;
		proc_t p;
		demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
		p = TAILQ_FIRST(&demotion_bucket->list);

		assert(p && p->p_memstat_idledeadline);

		if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline) {
			thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
			memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
		}
	}
}

/*
 * List manipulation
 */

int
memorystatus_add(proc_t p, boolean_t locked)
{
	memstat_bucket_t *bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority);

	if (!locked) {
		proc_list_lock();
	}

	/* Processes marked internal do not have priority tracked */
	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		goto exit;
	}

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		assert(bucket->count == memorystatus_scheduled_idle_demotions);
	}

	TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
	bucket->count++;

	memorystatus_list_count++;

	memorystatus_check_levels_locked();

exit:
	if (!locked) {
		proc_list_unlock();
	}

	return 0;
}

/*
 * Description:
 * Moves a process from one jetsam bucket to another, which changes
 * the LRU position of the process.
 *
 * Monitors transitions between buckets and, if necessary,
 * updates cached memory limits accordingly.
 */
static void
memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert)
{
	memstat_bucket_t *old_bucket, *new_bucket;

	assert(priority < MEMSTAT_BUCKET_COUNT);

	/* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		return;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting pid %d to priority %d, inserting at %s\n",
	    p->p_pid, priority, head_insert ? "head" : "tail");

	old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		assert(old_bucket->count == (memorystatus_scheduled_idle_demotions + 1));
	}

	TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
	old_bucket->count--;

	new_bucket = &memstat_bucket[priority];
	if (head_insert)
		TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
	else
		TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	new_bucket->count++;

#if CONFIG_JETSAM
	if (memorystatus_highwater_enabled) {
		boolean_t trigger_exception;

		/*
		 * If cached limit data is updated, then the limits
		 * will be enforced by writing to the ledgers.
		 */
		boolean_t ledger_update_needed = TRUE;

		/*
		 * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore.
		 * Background limits are described via the inactive limit slots.
		 *
		 * Here, we must update the cached memory limit if the task
		 * is transitioning between:
		 *	active <--> inactive
		 *	FG     <-->  BG
		 * but:
		 *	dirty  <--> clean is ignored
		 *
		 * We bypass processes that have opted into dirty tracking because
		 * a move between buckets does not imply a transition between the
		 * dirty <--> clean state.
		 * Setting limits on processes opted into dirty tracking is handled
		 * in memorystatus_dirty_set() where the transition is very clear.
		 */

		if (p->p_memstat_dirty & P_DIRTY_TRACK) {

			ledger_update_needed = FALSE;

		} else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) {
			/*
			 * inactive --> active
			 * BG       -->  FG
			 * assign active state
			 */
			CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);

		} else if ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
			/*
			 * active --> inactive
			 * FG     -->  BG
			 * assign inactive state
			 */
			CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
		} else {
			/*
			 * The transition between jetsam priority buckets apparently did
			 * not affect active/inactive state.
			 * This is not unusual... especially during startup when
			 * processes are getting established in their respective bands.
			 */
			ledger_update_needed = FALSE;
		}

		/*
		 * Enforce the new limits by writing to the ledger
		 */
		if (ledger_update_needed) {
			task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, trigger_exception);

			MEMORYSTATUS_DEBUG(3, "memorystatus_update_priority_locked: new limit on pid %d (%dMB %s) priority old --> new (%d --> %d) dirty?=0x%x %s\n",
			    p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
			    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, priority, p->p_memstat_dirty,
			    (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
		}
	}

#endif /* CONFIG_JETSAM */

	p->p_memstat_effectivepriority = priority;

	memorystatus_check_levels_locked();
}

/*
 *
 * Description: Update the jetsam priority and memory limit attributes for a given process.
 *
 * Parameters:
 *	p		  init this process's jetsam information.
 *	priority	  The jetsam priority band
 *	user_data	  user specific data, unused by the kernel
 *	effective	  guards against race if process's update already occurred
 *	update_memlimit   When true we know this is the init step via the posix_spawn path.
 *
 *	memlimit_active	  Value in megabytes; the monitored footprint level while the
 *			  process is active.  Exceeding it may result in termination
 *			  based on its associated fatal flag.
 *
 *	memlimit_active_is_fatal  When a process is active and exceeds its memory footprint,
 *			  this describes whether or not it should be immediately fatal.
 *
 *	memlimit_inactive Value in megabytes; the monitored footprint level while the
 *			  process is inactive.  Exceeding it may result in termination
 *			  based on its associated fatal flag.
 *
 *	memlimit_inactive_is_fatal  When a process is inactive and exceeds its memory footprint,
 *			  this describes whether or not it should be immediately fatal.
 *
 *	memlimit_background	This process has a high-water-mark while in the background.
 *			  No longer meaningful.  Background limits are described via
 *			  the inactive slots.  Flag is ignored.
 *
 *
 * Returns:	0	Success
 *		non-0	Failure
 */
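
/*
 * Illustrative call (a sketch; the numeric values are hypothetical):
 * place a process in the default band with a 100 MB fatal active limit
 * and a 50 MB non-fatal inactive limit.
 *
 *	error = memorystatus_update(p, JETSAM_PRIORITY_DEFAULT, 0, FALSE, TRUE,
 *	    100, TRUE,		// memlimit_active, fatal
 *	    50, FALSE,		// memlimit_inactive, non-fatal
 *	    FALSE);		// memlimit_background (ignored)
 */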

int
memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective, boolean_t update_memlimit,
		    int32_t memlimit_active,   boolean_t memlimit_active_is_fatal,
		    int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal,
		    __unused boolean_t memlimit_background)
{
	int ret;
	boolean_t head_insert = false;

#if !CONFIG_JETSAM
#pragma unused(update_memlimit, memlimit_active, memlimit_inactive)
#pragma unused(memlimit_active_is_fatal, memlimit_inactive_is_fatal)
#endif /* !CONFIG_JETSAM */

	MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing pid %d: priority %d, user_data 0x%llx\n", p->p_pid, priority, user_data);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0);

	if (priority == -1) {
		/* Use as shorthand for default priority */
		priority = JETSAM_PRIORITY_DEFAULT;
	} else if (priority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		/* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
		priority = JETSAM_PRIORITY_IDLE;
	} else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
		/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
		priority = JETSAM_PRIORITY_IDLE;
		head_insert = TRUE;
	} else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
		/* Sanity check */
		ret = EINVAL;
		goto out;
	}

	proc_list_lock();

	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
		ret = EALREADY;
		proc_list_unlock();
		MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid);
		goto out;
	}

	if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) {
		/*
		 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
		 */
		ret = EBUSY;
		proc_list_unlock();
		goto out;
	}

	p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
	p->p_memstat_userdata = user_data;
	p->p_memstat_requestedpriority = priority;

#if CONFIG_JETSAM
	if (update_memlimit) {
		boolean_t trigger_exception;

		/*
		 * Posix_spawn'd processes come through this path to instantiate ledger limits.
		 * Forked processes do not come through this path, so no ledger limits exist.
		 * (That's why forked processes can consume unlimited memory.)
		 */

		MEMORYSTATUS_DEBUG(3, "memorystatus_update(enter): pid %d, priority %d, dirty=0x%x, Active(%dMB %s), Inactive(%dMB, %s)\n",
			p->p_pid, priority, p->p_memstat_dirty,
			memlimit_active,   (memlimit_active_is_fatal ? "F " : "NF"),
			memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF"));

		if (memlimit_background) {

			/*
			 * With 2-level HWM support, we no longer honor P_MEMSTAT_MEMLIMIT_BACKGROUND.
			 * Background limits are described via the inactive limit slots.
			 */

			// p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_BACKGROUND;

#if DEVELOPMENT || DEBUG
			printf("memorystatus_update: WARNING %s[%d] set unused flag P_MEMSTAT_MEMLIMIT_BACKGROUND [A==%dMB %s] [IA==%dMB %s]\n",
				(p->p_comm ? p->p_comm : "unknown"), p->p_pid,
				memlimit_active,   (memlimit_active_is_fatal ? "F " : "NF"),
				memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF"));
#endif /* DEVELOPMENT || DEBUG */
		}

		if (memlimit_active <= 0) {
			/*
			 * This process will have a system_wide task limit when active.
			 * The system_wide task limit is always fatal.
			 * It's quite common to see a non-fatal flag passed in here.
			 * It's not an error; we just ignore it.
			 */

			/*
			 * For backward compatibility with some unexplained launchd behavior,
			 * we allow a zero-sized limit.  But we still enforce the system_wide
			 * limit when written to the ledgers.
			 */

			if (memlimit_active < 0) {
				memlimit_active = -1;  /* enforces system_wide task limit */
			}
			memlimit_active_is_fatal = TRUE;
		}

		if (memlimit_inactive <= 0) {
			/*
			 * This process will have a system_wide task limit when inactive.
			 * The system_wide task limit is always fatal.
			 */

			memlimit_inactive = -1;
			memlimit_inactive_is_fatal = TRUE;
		}

		/*
		 * Initialize the active limit variants for this process.
		 */
		SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);

		/*
		 * Initialize the inactive limit variants for this process.
		 */
		SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);

		/*
		 * Initialize the cached limits for the target process.
		 * When the target process is dirty tracked, it's typically
		 * in a clean state.  Non dirty tracked processes are
		 * typically active (Foreground or above).
		 * But just in case, we don't make assumptions...
		 */

		if (proc_jetsam_state_is_active_locked(p) == TRUE) {
			CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);
		} else {
			CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
		}

		/*
		 * Enforce the cached limit by writing to the ledger.
		 */
		if (memorystatus_highwater_enabled) {
			/* apply now */
			assert(trigger_exception == TRUE);
			task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, trigger_exception);

			MEMORYSTATUS_DEBUG(3, "memorystatus_update: init: limit on pid %d (%dMB %s) targeting priority(%d) dirty?=0x%x %s\n",
				p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
				(p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), priority, p->p_memstat_dirty,
				(p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
		}
	}
#endif /* CONFIG_JETSAM */

	/*
	 * We can't add to the JETSAM_PRIORITY_IDLE_DEFERRED bucket here.
	 * But, we could be removing it from the bucket.
	 * Check and take appropriate steps if so.
	 */

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		memorystatus_invalidate_idle_demotion_locked(p, TRUE);
	}

	memorystatus_update_priority_locked(p, priority, head_insert);

	proc_list_unlock();
	ret = 0;

out:
	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);

	return ret;
}

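/*
 * Illustrative userspace sketch (not part of this file): memorystatus_update()
 * is typically reached via the memorystatus_control() syscall.  Assuming the
 * MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES command and the
 * memorystatus_priority_properties_t layout from <sys/kern_memorystatus.h>,
 * a privileged daemon could request a new band for a process roughly like so:
 *
 *	memorystatus_priority_properties_t props;
 *	props.priority  = JETSAM_PRIORITY_BACKGROUND;	// hypothetical target band
 *	props.user_data = 0;				// opaque to the kernel
 *	if (memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES,
 *				 pid, 0, &props, sizeof(props)) != 0) {
 *		// errno mirrors the EINVAL/EALREADY/EBUSY checks above
 *	}
 *
 * This sketch makes assumptions about the userspace entry point; the
 * authoritative flow is the kernel code above.
 */
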
int
memorystatus_remove(proc_t p, boolean_t locked)
{
	int ret;
	memstat_bucket_t *bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", p->p_pid);

	if (!locked) {
		proc_list_lock();
	}

	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];
	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		assert(bucket->count == memorystatus_scheduled_idle_demotions);
	}

	TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
	bucket->count--;

	memorystatus_list_count--;

	/* If awaiting demotion to the idle band, clean up */
	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		memorystatus_invalidate_idle_demotion_locked(p, TRUE);
		memorystatus_reschedule_idle_demotion_locked();
	}

	memorystatus_check_levels_locked();

#if CONFIG_FREEZE
	if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
		memorystatus_frozen_count--;
	}

	if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
		memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
		memorystatus_suspended_count--;
	}
#endif

	if (!locked) {
		proc_list_unlock();
	}

	if (p) {
		ret = 0;
	} else {
		ret = ESRCH;
	}

	return ret;
}

/*
 * Validate dirty tracking flags with process state.
 *
 * Return:
 *	0     on success
 *	non-0 on failure
 */

static int
memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) {
	/* See that the process isn't marked for termination */
	if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
		return EBUSY;
	}

	/* Idle exit requires that process be tracked */
	if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
	    !(pcontrol & PROC_DIRTY_TRACK)) {
		return EINVAL;
	}

	/* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
	if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
	    !(pcontrol & PROC_DIRTY_TRACK)) {
		return EINVAL;
	}

	/* Deferral is only relevant if idle exit is specified */
	if ((pcontrol & PROC_DIRTY_DEFER) &&
	    !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
		return EINVAL;
	}

	return (0);
}

static void
memorystatus_update_idle_priority_locked(proc_t p) {
	int32_t priority;

	MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty);

	if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
		priority = (p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS) ? JETSAM_PRIORITY_IDLE_DEFERRED : JETSAM_PRIORITY_IDLE;
	} else {
		priority = p->p_memstat_requestedpriority;
	}

	if (priority != p->p_memstat_effectivepriority) {
		memorystatus_update_priority_locked(p, priority, false);
	}
}

/*
 * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
 * (clean).  They may also indicate that they support termination when idle, with the result that they are promoted
 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
 * priority idle band when clean (and killed earlier, protecting higher priority processes).
 *
 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
 * memorystatus_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
 * make forward progress.  Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
 * band.  The deferral can be cleared early by clearing the appropriate flag.
 *
 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
 * is marked dirty, the timer will be cancelled.  Upon being subsequently marked clean, the deferment will either be
 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
 */

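/*
 * Illustrative userspace sketch (not part of this file): assuming the libproc
 * wrappers proc_track_dirty() and proc_set_dirty() front this interface, a
 * dirty-tracking daemon's lifecycle looks roughly like:
 *
 *	// opt in once at startup: track state, allow idle exit, request deferral
 *	proc_track_dirty(getpid(),
 *	    PROC_DIRTY_TRACK | PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER);
 *
 *	// bracket each unit of work
 *	proc_set_dirty(getpid(), true);		// promoted to the requested band
 *	// ... do work, mutate state ...
 *	proc_set_dirty(getpid(), false);	// demoted to the (deferred-)idle band
 *
 * The wrapper names are an assumption; the kernel-side effects are the
 * memorystatus_dirty_track() and memorystatus_dirty_set() paths below.
 */
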
int
memorystatus_dirty_track(proc_t p, uint32_t pcontrol) {
	unsigned int old_dirty;
	boolean_t reschedule = FALSE;
	boolean_t already_deferred = FALSE;
	boolean_t defer_now = FALSE;
	int ret = 0;

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK),
		p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0);

	proc_list_lock();

	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
		/* error */
		goto exit;
	}

	old_dirty = p->p_memstat_dirty;

	/* These bits are cumulative, as per <rdar://problem/11159924> */
	if (pcontrol & PROC_DIRTY_TRACK) {
		p->p_memstat_dirty |= P_DIRTY_TRACK;
	}

	if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
		p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	if (old_dirty & P_DIRTY_DEFER_IN_PROGRESS) {
		already_deferred = TRUE;
	}

	/* This can be set and cleared exactly once. */
	if (pcontrol & PROC_DIRTY_DEFER) {

		if ( !(old_dirty & P_DIRTY_DEFER)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER;
		}

		defer_now = TRUE;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for pid %d\n",
		((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
		defer_now ? "Y" : "N",
		p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
		p->p_pid);

	/* Kick off or invalidate the idle exit deferment if there's a state transition. */
	if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
		if (((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) &&
		    defer_now && !already_deferred) {

			/*
			 * Request to defer a clean process that's idle-exit enabled
			 * and not already in the jetsam deferred band.
			 */
			memorystatus_schedule_idle_demotion_locked(p, TRUE);
			reschedule = TRUE;

		} else if (!defer_now && already_deferred) {

			/*
			 * Either the process is no longer idle-exit enabled OR
			 * there's a request to cancel a currently active deferral.
			 */
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			reschedule = TRUE;
		}
	} else {

		/*
		 * We are trying to operate on a dirty process.  Dirty processes have to
		 * be removed from the deferred band.  The question is do we reset the
		 * deferred state or not?
		 *
		 * This could be a legal request like:
		 * - this process had opted into the JETSAM_DEFERRED band
		 * - but it's now dirty and requests to opt out.
		 * In this case, we remove the process from the band and reset its
		 * state too.  It'll opt back in properly when needed.
		 *
		 * OR, this request could be a user-space bug.  E.g.:
		 * - this process had opted into the JETSAM_DEFERRED band when clean
		 * - and, then issues another request to again put it into the band except
		 *   this time the process is dirty.
		 * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of
		 * the deferred band with its state intact.  So our request below is a no-op.
		 * But we do it here anyway for coverage.
		 *
		 * memorystatus_update_idle_priority_locked()
		 * single-mindedly treats a dirty process as "cannot be in the deferred band".
		 */

		if (!defer_now && already_deferred) {
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			reschedule = TRUE;
		} else {
			memorystatus_invalidate_idle_demotion_locked(p, FALSE);
			reschedule = TRUE;
		}
	}

	memorystatus_update_idle_priority_locked(p);

	if (reschedule) {
		memorystatus_reschedule_idle_demotion_locked();
	}

	ret = 0;

exit:
	proc_list_unlock();

	return ret;
}

int
memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) {
	int ret;
	boolean_t kill = false;
	boolean_t reschedule = FALSE;
	boolean_t was_dirty = FALSE;
	boolean_t now_dirty = FALSE;

	MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0);

	proc_list_lock();

	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
		was_dirty = TRUE;

	if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
		/* Dirty tracking not enabled */
		ret = EINVAL;
	} else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
		/*
		 * Process is set to be terminated and we're attempting to mark it dirty.
		 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
		 */
		ret = EBUSY;
	} else {
		int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
		if (pcontrol && !(p->p_memstat_dirty & flag)) {
			/* Mark the process as having been dirtied at some point */
			p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
			memorystatus_dirty_count++;
			ret = 0;
		} else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
			if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
				/* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
				kill = true;
			} else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
				/* Kill previously terminated processes if set clean */
				kill = true;
			}
			p->p_memstat_dirty &= ~flag;
			memorystatus_dirty_count--;
			ret = 0;
		} else {
			/* Already set */
			ret = EALREADY;
		}
	}

	if (ret != 0) {
		goto exit;
	}

	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
		now_dirty = TRUE;

	if ((was_dirty == TRUE && now_dirty == FALSE) ||
	    (was_dirty == FALSE && now_dirty == TRUE)) {

		/* Manage idle exit deferral, if applied */
		if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) ==
		    (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) {

			/*
			 * P_DIRTY_DEFER_IN_PROGRESS means the process is in the deferred band OR it might be heading back
			 * there once it's clean again and has some protection window left.
			 */

			if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
				/*
				 * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE"
				 *
				 * The process will move from the deferred band to its higher requested
				 * jetsam band.  But we don't clear its state i.e. we want to remember that
				 * this process was part of the "deferred" band and will return to it.
				 *
				 * This way, we don't let it age beyond the protection
				 * window when it returns to "clean".  All the while giving
				 * it a chance to perform its work while "dirty".
				 */
				memorystatus_invalidate_idle_demotion_locked(p, FALSE);
				reschedule = TRUE;
			} else {

				/*
				 * Process is back from "dirty" to "clean".
				 *
				 * Is its timer up OR does it still have some protection
				 * window left?
				 */

				if (mach_absolute_time() >= p->p_memstat_idledeadline) {
					/*
					 * The process' deadline has expired.  It currently
					 * does not reside in the DEFERRED bucket.
					 *
					 * It's on its way to the JETSAM_PRIORITY_IDLE
					 * bucket via memorystatus_update_idle_priority_locked()
					 * below.
					 *
					 * So all we need to do is reset all the state on the
					 * process that's related to the DEFERRED bucket i.e.
					 * the DIRTY_DEFER_IN_PROGRESS flag and the timer deadline.
					 */

					memorystatus_invalidate_idle_demotion_locked(p, TRUE);
					reschedule = TRUE;
				} else {
					/*
					 * It still has some protection window left and so
					 * we just re-arm the timer without modifying any
					 * state on the process.
					 */
					memorystatus_schedule_idle_demotion_locked(p, FALSE);
					reschedule = TRUE;
				}
			}
		}

		memorystatus_update_idle_priority_locked(p);

#if CONFIG_JETSAM
		if (memorystatus_highwater_enabled) {
			boolean_t trigger_exception;
			/*
			 * We are in this path because this process transitioned between
			 * dirty <--> clean state.  Update the cached memory limits.
			 */

			if (proc_jetsam_state_is_active_locked(p) == TRUE) {
				/*
				 * process is dirty
				 */
				CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);
			} else {
				/*
				 * process is clean
				 */
				CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
			}

			/*
			 * Enforce the new limits by writing to the ledger.
			 *
			 * This is a hot path and holding the proc_list_lock while writing to the ledgers,
			 * (where the task lock is taken) is bad.  So, we temporarily drop the proc_list_lock.
			 * We aren't traversing the jetsam bucket list here, so we should be safe.
			 * See rdar://21394491.
			 */

			if (proc_ref_locked(p) == p) {
				int ledger_limit;
				if (p->p_memstat_memlimit > 0) {
					ledger_limit = p->p_memstat_memlimit;
				} else {
					ledger_limit = -1;
				}
				proc_list_unlock();
				task_set_phys_footprint_limit_internal(p->task, ledger_limit, NULL, trigger_exception);
				proc_list_lock();
				proc_rele_locked(p);

				MEMORYSTATUS_DEBUG(3, "memorystatus_dirty_set: new limit on pid %d (%dMB %s) priority(%d) dirty?=0x%x %s\n",
					p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
					(p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
					(p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
			}

		}
#endif /* CONFIG_JETSAM */

		/* If the deferral state changed, reschedule the demotion timer */
		if (reschedule) {
			memorystatus_reschedule_idle_demotion_locked();
		}
	}

	if (kill) {
		if (proc_ref_locked(p) == p) {
			proc_list_unlock();
			psignal(p, SIGKILL);
			proc_list_lock();
			proc_rele_locked(p);
		}
	}

exit:
	proc_list_unlock();

	return ret;
}

int
memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) {

	int ret = 0;

	MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0);

	proc_list_lock();

	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
		/* Dirty tracking not enabled */
		ret = EINVAL;
		goto exit;
	}

	if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER)) == 0) {
		ret = EINVAL;
		goto exit;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	/* This can be set and cleared exactly once. */
	if (pcontrol & PROC_DIRTY_DEFER) {

		if (p->p_memstat_dirty & P_DIRTY_DEFER) {

			p->p_memstat_dirty &= ~P_DIRTY_DEFER;

			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			memorystatus_update_idle_priority_locked(p);
			memorystatus_reschedule_idle_demotion_locked();
		}
	}

	ret = 0;
exit:
	proc_list_unlock();

	return ret;
}

int
memorystatus_dirty_get(proc_t p) {
	int ret = 0;

	proc_list_lock();

	if (p->p_memstat_dirty & P_DIRTY_TRACK) {
		ret |= PROC_DIRTY_TRACKED;
		if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
			ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
		}
		if (p->p_memstat_dirty & P_DIRTY) {
			ret |= PROC_DIRTY_IS_DIRTY;
		}
		if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
			ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
		}
	}

	proc_list_unlock();

	return ret;
}
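
/*
 * Illustrative sketch (not part of this file): callers of the get path (for
 * example through an assumed proc_get_dirty() libproc wrapper) receive the
 * PROC_DIRTY_* bitmask built above rather than an errno-style value:
 *
 *	uint32_t flags = 0;
 *	if (proc_get_dirty(pid, &flags) == 0) {
 *		bool tracked = (flags & PROC_DIRTY_TRACKED) != 0;
 *		bool dirty   = (flags & PROC_DIRTY_IS_DIRTY) != 0;
 *		// PROC_DIRTY_ALLOWS_IDLE_EXIT and PROC_DIRTY_LAUNCH_IS_IN_PROGRESS
 *		// carry the remaining state bits.
 *	}
 */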

int
memorystatus_on_terminate(proc_t p) {
	int sig;

	proc_list_lock();

	p->p_memstat_dirty |= P_DIRTY_TERMINATED;

	if ((p->p_memstat_dirty & (P_DIRTY_TRACK|P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) {
		/* Clean; mark as terminated and issue SIGKILL */
		sig = SIGKILL;
	} else {
		/* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
		sig = SIGTERM;
	}

	proc_list_unlock();

	return sig;
}

void
memorystatus_on_suspend(proc_t p)
{
#if CONFIG_FREEZE
	uint32_t pages;
	memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
#endif
	proc_list_lock();
#if CONFIG_FREEZE
	p->p_memstat_suspendedfootprint = pages;
	memorystatus_suspended_footprint_total += pages;
	memorystatus_suspended_count++;
#endif
	p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
	proc_list_unlock();
}

void
memorystatus_on_resume(proc_t p)
{
#if CONFIG_FREEZE
	boolean_t frozen;
	pid_t pid;
#endif

	proc_list_lock();

#if CONFIG_FREEZE
	frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
	if (frozen) {
		memorystatus_frozen_count--;
		p->p_memstat_state |= P_MEMSTAT_PRIOR_THAW;
	}

	memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
	memorystatus_suspended_count--;

	pid = p->p_pid;
#endif

	p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);

	proc_list_unlock();

#if CONFIG_FREEZE
	if (frozen) {
		memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
		memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
	}
#endif
}

void
memorystatus_on_inactivity(proc_t p)
{
#pragma unused(p)
#if CONFIG_FREEZE
	/* Wake the freeze thread */
	thread_wakeup((event_t)&memorystatus_freeze_wakeup);
#endif
}

static uint32_t
memorystatus_build_state(proc_t p) {
	uint32_t snapshot_state = 0;

	/* General */
	if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
		snapshot_state |= kMemorystatusSuspended;
	}
	if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
		snapshot_state |= kMemorystatusFrozen;
	}
	if (p->p_memstat_state & P_MEMSTAT_PRIOR_THAW) {
		snapshot_state |= kMemorystatusWasThawed;
	}

	/* Tracking */
	if (p->p_memstat_dirty & P_DIRTY_TRACK) {
		snapshot_state |= kMemorystatusTracked;
	}
	if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
		snapshot_state |= kMemorystatusSupportsIdleExit;
	}
	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
		snapshot_state |= kMemorystatusDirty;
	}

	return snapshot_state;
}

#if !CONFIG_JETSAM

static boolean_t
kill_idle_exit_proc(void)
{
	proc_t p, victim_p = PROC_NULL;
	uint64_t current_time;
	boolean_t killed = FALSE;
	unsigned int i = 0;

	/* Pick next idle exit victim. */
	current_time = mach_absolute_time();

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, FALSE);
	while (p) {
		/* No need to look beyond the idle band */
		if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
			break;
		}

		if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT|P_DIRTY_IS_DIRTY|P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
			if (current_time >= p->p_memstat_idledeadline) {
				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
				victim_p = proc_ref_locked(p);
				break;
			}
		}

		p = memorystatus_get_next_proc_locked(&i, p, FALSE);
	}

	proc_list_unlock();

	if (victim_p) {
		printf("memorystatus_thread: idle exiting pid %d [%s]\n", victim_p->p_pid, (victim_p->p_comm ? victim_p->p_comm : "(unknown)"));
		killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit);
		proc_rele(victim_p);
	}

	return killed;
}
#endif

#if CONFIG_JETSAM
static void
memorystatus_thread_wake(void) {
	thread_wakeup((event_t)&memorystatus_wakeup);
}
#endif /* CONFIG_JETSAM */

extern void vm_pressure_response(void);

static int
memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
{
	if (interval_ms) {
		assert_wait_timeout(&memorystatus_wakeup, THREAD_UNINT, interval_ms, 1000 * NSEC_PER_USEC);
	} else {
		assert_wait(&memorystatus_wakeup, THREAD_UNINT);
	}

	return thread_block(continuation);
}

static void
memorystatus_thread(void *param __unused, wait_result_t wr __unused)
{
	static boolean_t is_vm_privileged = FALSE;

#if CONFIG_JETSAM
	boolean_t post_snapshot = FALSE;
	uint32_t errors = 0;
	uint32_t hwm_kill = 0;
	boolean_t sort_flag = TRUE;

	/* Jetsam Loop Detection - locals */
	memstat_bucket_t *bucket;
	int jld_bucket_count = 0;
	struct timeval jld_now_tstamp = {0,0};
	uint64_t jld_now_msecs = 0;

	/* Jetsam Loop Detection - statics */
	static uint64_t jld_timestamp_msecs = 0;
	static int jld_idle_kill_candidates = 0;	/* Number of available processes in band 0,1 at start */
	static int jld_idle_kills = 0;			/* Number of procs killed during eval period */
	static int jld_eval_aggressive_count = 0;	/* Bumps the max priority in aggressive loop */
	static int32_t jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
#endif

	if (is_vm_privileged == FALSE) {
		/*
		 * It's the first time the thread has run, so just mark the thread as privileged and block.
		 * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
		 */
		thread_wire(host_priv_self(), current_thread(), TRUE);
		is_vm_privileged = TRUE;

		if (vm_restricted_to_single_processor == TRUE)
			thread_vm_bind_group_add();

		memorystatus_thread_block(0, memorystatus_thread);
	}

#if CONFIG_JETSAM

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
		memorystatus_available_pages, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, memorystatus_jld_eval_aggressive_count, 0);

	/*
	 * Jetsam aware version.
	 *
	 * The VM pressure notification thread is working its way through clients in parallel.
	 *
	 * So, while the pressure notification thread is targeting processes in order of
	 * increasing jetsam priority, we can hopefully reduce / stop its work by killing
	 * any processes that have exceeded their highwater mark.
	 *
	 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
	 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
	 */
	while (is_thrashing(kill_under_pressure_cause) ||
	       memorystatus_available_pages <= memorystatus_available_pages_pressure) {
		boolean_t killed;
		int32_t priority;
		uint32_t cause;

		if (kill_under_pressure_cause) {
			cause = kill_under_pressure_cause;
		} else {
			cause = kMemorystatusKilledVMPageShortage;
		}

#if LEGACY_HIWATER
		/* Highwater */
		killed = memorystatus_kill_hiwat_proc(&errors);
		if (killed) {
			hwm_kill++;
			post_snapshot = TRUE;
			goto done;
		} else {
			memorystatus_hwm_candidates = FALSE;
		}

		/* No highwater processes to kill. Continue or stop for now? */
		if (!is_thrashing(kill_under_pressure_cause) &&
		    (memorystatus_available_pages > memorystatus_available_pages_critical)) {
			/*
			 * We are _not_ out of pressure but we are above the critical threshold and there's:
			 * - no compressor thrashing
			 * - no more HWM processes left.
			 * For now, don't kill any other processes.
			 */

			if (hwm_kill == 0) {
				memorystatus_thread_wasted_wakeup++;
			}

			break;
		}
#endif
		if (memorystatus_jld_enabled == TRUE) {

			/*
			 * Jetsam Loop Detection: attempt to detect
			 * rapid daemon relaunches in the lower bands.
			 */

			microuptime(&jld_now_tstamp);

			/*
			 * Ignore usecs in this calculation.
			 * msecs granularity is close enough.
			 */
			jld_now_msecs = (jld_now_tstamp.tv_sec * 1000);

			proc_list_lock();
			bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
			jld_bucket_count = bucket->count;
			bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
			jld_bucket_count += bucket->count;
			proc_list_unlock();

			/*
			 * memorystatus_jld_eval_period_msecs is a tunable
			 * memorystatus_jld_eval_aggressive_count is a tunable
			 * memorystatus_jld_eval_aggressive_priority_band_max is a tunable
			 */
			if ( (jld_bucket_count == 0) ||
			     (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) {

				/*
				 * Refresh evaluation parameters
				 */
				jld_timestamp_msecs = jld_now_msecs;
				jld_idle_kill_candidates = jld_bucket_count;
				jld_idle_kills = 0;
				jld_eval_aggressive_count = 0;
				jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
			}

			if (jld_idle_kills > jld_idle_kill_candidates) {
				jld_eval_aggressive_count++;
				if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) {
					/*
					 * Bump up the jetsam priority limit (eg: the bucket index)
					 * Enforce bucket index sanity.
					 */
					if ((memorystatus_jld_eval_aggressive_priority_band_max < 0) ||
					    (memorystatus_jld_eval_aggressive_priority_band_max >= MEMSTAT_BUCKET_COUNT)) {
						/*
						 * Do nothing.  Stick with the default level.
						 */
					} else {
						jld_priority_band_max = memorystatus_jld_eval_aggressive_priority_band_max;
					}
				}

				killed = memorystatus_kill_top_process_aggressive(
					TRUE,
					kMemorystatusKilledVMThrashing,
					jld_eval_aggressive_count,
					jld_priority_band_max,
					&errors);

				if (killed) {
					/* Always generate logs after aggressive kill */
					post_snapshot = TRUE;
					goto done;
				}
			}
		}

		/* LRU */
		killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, &priority, &errors);
		sort_flag = FALSE;

		if (killed) {
			/*
			 * Don't generate logs for steady-state idle-exit kills,
			 * unless it is overridden for debug or by the device
			 * tree.
			 */
			if ((priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot) {
				post_snapshot = TRUE;
			}

			/* Jetsam Loop Detection */
			if (memorystatus_jld_enabled == TRUE) {
				if ((priority == JETSAM_PRIORITY_IDLE) || (priority == JETSAM_PRIORITY_IDLE_DEFERRED)) {
					jld_idle_kills++;
				} else {
					/*
					 * We've reached into bands beyond idle deferred.
					 * We make no attempt to monitor them.
					 */
				}
			}
			goto done;
		}

		if (memorystatus_available_pages <= memorystatus_available_pages_critical) {
			/* Under pressure and unable to kill a process - panic */
			panic("memorystatus_jetsam_thread: no victim! available pages:%d\n", memorystatus_available_pages);
		}

done:

		/*
		 * We do not want to over-kill when thrashing has been detected.
		 * To avoid that, we reset the flag here and notify the
		 * compressor.
		 */
		if (is_thrashing(kill_under_pressure_cause)) {
			kill_under_pressure_cause = 0;
			vm_thrashing_jetsam_done();
		}
	}

	kill_under_pressure_cause = 0;

	if (errors) {
		memorystatus_clear_errors();
	}

#if VM_PRESSURE_EVENTS
	/*
	 * LD: We used to target the foreground process first and foremost here.
	 * Now, we target all processes, starting from the non-suspended, background
	 * processes first.  We will target foreground too.
	 *
	 * memorystatus_update_vm_pressure(TRUE);
	 */
	//vm_pressure_response();
#endif

	if (post_snapshot) {
		size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
			sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
		uint64_t timestamp_now = mach_absolute_time();
		memorystatus_jetsam_snapshot->notification_time = timestamp_now;
		if (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
		    timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout) {
			int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
			if (!ret) {
				proc_list_lock();
				memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
				proc_list_unlock();
			}
		}
	}

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
		memorystatus_available_pages, 0, 0, 0, 0);

#else /* CONFIG_JETSAM */

	/*
	 * Jetsam not enabled
	 */

#endif /* CONFIG_JETSAM */

	memorystatus_thread_block(0, memorystatus_thread);
}
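
/*
 * Worked example of the loop-detection window above (illustrative numbers,
 * not the tunables' defaults): suppose memorystatus_jld_eval_period_msecs is
 * 6000 and the idle + idle-deferred buckets hold 10 processes when the window
 * opens, so jld_idle_kill_candidates == 10.  If idle-band kills during that
 * 6-second window push jld_idle_kills past 10, jld_eval_aggressive_count is
 * bumped; once it exceeds memorystatus_jld_eval_aggressive_count, the sweep
 * is allowed to reach up toward jld_priority_band_max via
 * memorystatus_kill_top_process_aggressive().  An expired window or an empty
 * bucket resets all of the counters.
 */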

#if !CONFIG_JETSAM
/*
 * Returns TRUE:
 *	when an idle-exitable proc was killed
 * Returns FALSE:
 *	when there are no more idle-exitable procs found
 *	when the attempt to kill an idle-exitable proc failed
 */
boolean_t memorystatus_idle_exit_from_VM(void) {
	return(kill_idle_exit_proc());
}
#endif /* !CONFIG_JETSAM */

#if CONFIG_JETSAM

/*
 * Callback invoked when allowable physical memory footprint exceeded
 * (dirty pages + IOKit mappings)
 *
 * This is invoked for both advisory, non-fatal per-task high watermarks,
 * as well as the fatal task memory limits.
 */
void
memorystatus_on_ledger_footprint_exceeded(boolean_t warning, const int max_footprint_mb)
{
	boolean_t is_active;
	boolean_t is_fatal;

	proc_t p = current_proc();

	proc_list_lock();

	is_active = proc_jetsam_state_is_active_locked(p);
	is_fatal = (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT);

	if (warning == FALSE) {
		/*
		 * We only want the EXC_RESOURCE to trigger once per lifetime
		 * of the active/inactive limit state.  So, here, we detect the
		 * active/inactive state of the process and mark that state's
		 * exception as having been triggered.
		 */
		if (is_active == TRUE) {
			/*
			 * turn off exceptions for active state
			 */
			p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_EXC_TRIGGERED;
		} else {
			/*
			 * turn off exceptions for inactive state
			 */
			p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_EXC_TRIGGERED;
		}

		/*
		 * Soft memory limit is a non-fatal high-water-mark
		 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
		 */
		printf("process %d (%s) exceeded physical memory footprint, the %s%sMemoryLimit of %d MB\n",
			p->p_pid, p->p_comm, (is_active ? "Active" : "Inactive"),
			(is_fatal ? "Hard" : "Soft"), max_footprint_mb);

	}

	proc_list_unlock();

#if VM_PRESSURE_EVENTS
	if (warning == TRUE) {
		if (memorystatus_warn_process(p->p_pid, TRUE /* critical? */) != TRUE) {
			/* Print warning, since it's possible that task has not registered for pressure notifications */
			printf("task_exceeded_footprint: failed to warn the current task (exiting, or no handler registered?).\n");
		}
		return;
	}
#endif /* VM_PRESSURE_EVENTS */

	if (is_fatal) {
		/*
		 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
		 * has violated either the system-wide per-task memory limit OR its own task limit.
		 */
		if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit) != TRUE) {
			printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
		}
	} else {
		/*
		 * HWM offender exists.  Done without locks or synchronization.
		 * See comment near its declaration for more details.
		 */
		memorystatus_hwm_candidates = TRUE;
	}
}

/*
 * Toggle the P_MEMSTAT_TERMINATED state.
 * Takes the proc_list_lock.
 */
void
proc_memstat_terminated(proc_t p, boolean_t set)
{
#if DEVELOPMENT || DEBUG
	if (p) {
		proc_list_lock();
		if (set == TRUE) {
			p->p_memstat_state |= P_MEMSTAT_TERMINATED;
		} else {
			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
		}
		proc_list_unlock();
	}
#else
#pragma unused(p, set)
	/*
	 * do nothing
	 */
#endif /* DEVELOPMENT || DEBUG */
	return;
}

/*
 * This is invoked when cpulimits have been exceeded while in fatal mode.
 * The jetsam_flags do not apply as those are for memory related kills.
 * We call this routine so that the offending process is killed with
 * a non-zero exit status.
 */
void
jetsam_on_ledger_cpulimit_exceeded(void)
{
	int retval = 0;
	int jetsam_flags = 0;  /* make it obvious */
	proc_t p = current_proc();

	printf("task_exceeded_cpulimit: killing pid %d [%s]\n",
		p->p_pid, (p->p_comm ? p->p_comm : "(unknown)"));

	retval = jetsam_do_kill(p, jetsam_flags);

	if (retval) {
		printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
	}
}

static void
memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
{
	assert(task);
	assert(footprint);

	*footprint = (uint32_t)(get_task_phys_footprint(task) / PAGE_SIZE_64);
	if (max_footprint) {
		*max_footprint = (uint32_t)(get_task_phys_footprint_max(task) / PAGE_SIZE_64);
	}
	if (max_footprint_lifetime) {
		*max_footprint_lifetime = (uint32_t)(get_task_resident_max(task) / PAGE_SIZE_64);
	}
	if (purgeable_pages) {
		*purgeable_pages = (uint32_t)(get_task_purgeable_size(task) / PAGE_SIZE_64);
	}
}
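
/*
 * Worked example (illustrative): with 4KB pages (PAGE_SIZE_64 == 4096), a
 * task whose physical footprint is 50MB reports
 * *footprint == 52428800 / 4096 == 12800 pages.  The max, lifetime-max and
 * purgeable counts are converted the same way when requested.
 */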

static void
memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause)
{
	unsigned int i;

	for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
		if (memorystatus_jetsam_snapshot_list[i].pid == p->p_pid) {
			/* Update if the priority has changed since the snapshot was taken */
			if (memorystatus_jetsam_snapshot_list[i].priority != p->p_memstat_effectivepriority) {
				memorystatus_jetsam_snapshot_list[i].priority = p->p_memstat_effectivepriority;
				strlcpy(memorystatus_jetsam_snapshot_list[i].name, p->p_comm, MAXCOMLEN+1);
				memorystatus_jetsam_snapshot_list[i].state = memorystatus_build_state(p);
				memorystatus_jetsam_snapshot_list[i].user_data = p->p_memstat_userdata;
				memorystatus_jetsam_snapshot_list[i].fds = p->p_fd->fd_nfiles;
			}
			memorystatus_jetsam_snapshot_list[i].killed = kill_cause;
			return;
		}
	}
}

void memorystatus_pages_update(unsigned int pages_avail)
{
	memorystatus_available_pages = pages_avail;

#if VM_PRESSURE_EVENTS
	/*
	 * Since memorystatus_available_pages changes, we should
	 * re-evaluate the pressure levels on the system and
	 * check if we need to wake the pressure thread.
	 * We also update memorystatus_level in that routine.
	 */
	vm_pressure_response();

	if (memorystatus_available_pages <= memorystatus_available_pages_pressure) {

		if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) {
			memorystatus_thread_wake();
		}
	}
#else /* VM_PRESSURE_EVENTS */

	boolean_t critical, delta;

	if (!memorystatus_delta) {
		return;
	}

	critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
	delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
		|| (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;

	if (critical || delta) {
		memorystatus_level = memorystatus_available_pages * 100 / atop_64(max_mem);
		memorystatus_thread_wake();
	}
#endif /* VM_PRESSURE_EVENTS */
}
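
/*
 * Worked example for the non-VM_PRESSURE_EVENTS level computation above
 * (illustrative numbers): with 4KB pages and max_mem == 1GB,
 * atop_64(max_mem) == 262144 pages.  If memorystatus_available_pages is
 * 26214, then memorystatus_level == 26214 * 100 / 262144 == 9 in integer
 * division, i.e. the system reports just under 10% of memory available
 * when it wakes the memorystatus thread.
 */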

static boolean_t
memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry)
{
	clock_sec_t tv_sec;
	clock_usec_t tv_usec;

	memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));

	entry->pid = p->p_pid;
	strlcpy(&entry->name[0], p->p_comm, MAXCOMLEN+1);
	entry->priority = p->p_memstat_effectivepriority;
	memorystatus_get_task_page_counts(p->task, &entry->pages, &entry->max_pages, &entry->max_pages_lifetime, &entry->purgeable_pages);
	entry->state = memorystatus_build_state(p);
	entry->user_data = p->p_memstat_userdata;
	memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid));
	entry->fds = p->p_fd->fd_nfiles;

	absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
	entry->cpu_time.tv_sec = tv_sec;
	entry->cpu_time.tv_usec = tv_usec;

	return TRUE;
}

static void
memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
{
	kern_return_t kr = KERN_SUCCESS;
	mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
	vm_statistics64_data_t vm_stat;

	if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
		printf("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr);
		memset(&snapshot->stats, 0, sizeof(snapshot->stats));
	} else {
		snapshot->stats.free_pages = vm_stat.free_count;
		snapshot->stats.active_pages = vm_stat.active_count;
		snapshot->stats.inactive_pages = vm_stat.inactive_count;
		snapshot->stats.throttled_pages = vm_stat.throttled_count;
		snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
		snapshot->stats.wired_pages = vm_stat.wire_count;

		snapshot->stats.speculative_pages = vm_stat.speculative_count;
		snapshot->stats.filebacked_pages = vm_stat.external_page_count;
		snapshot->stats.anonymous_pages = vm_stat.internal_page_count;
		snapshot->stats.compressions = vm_stat.compressions;
		snapshot->stats.decompressions = vm_stat.decompressions;
		snapshot->stats.compressor_pages = vm_stat.compressor_page_count;
		snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
	}
}

/*
 * Collect vm statistics at boot.
 * Called only once (see kern_exec.c)
 * Data can be consumed at any time.
 */
void
memorystatus_init_at_boot_snapshot() {
	memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
	memorystatus_at_boot_snapshot.entry_count = 0;
	memorystatus_at_boot_snapshot.notification_time = 0;  /* updated when consumed */
	memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
}

static void
memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count )
{
	proc_t p, next_p;
	unsigned int b = 0, i = 0;

	memorystatus_jetsam_snapshot_t *snapshot = NULL;
	memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
	unsigned int snapshot_max = 0;

	if (od_snapshot) {
		/*
		 * This is an on_demand snapshot
		 */
		snapshot = od_snapshot;
		snapshot_list = od_snapshot->entries;
		snapshot_max = ods_list_count;
	} else {
		/*
		 * This is a jetsam event snapshot
		 */
		snapshot = memorystatus_jetsam_snapshot;
		snapshot_list = memorystatus_jetsam_snapshot->entries;
		snapshot_max = memorystatus_jetsam_snapshot_max;
	}

	memorystatus_init_snapshot_vmstats(snapshot);

	next_p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (next_p) {
		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);

		if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i])) {
			continue;
		}

		MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
			p->p_pid,
			p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7],
			p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]);

		if (++i == snapshot_max) {
			break;
		}
	}

	snapshot->snapshot_time = mach_absolute_time();
	snapshot->entry_count = i;

	if (!od_snapshot) {
		/* update the system buffer count */
		memorystatus_jetsam_snapshot_count = i;
	}
}

#if DEVELOPMENT || DEBUG

static int
memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size) {
	int ret;
	memorystatus_jetsam_panic_options_t debug;

	if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) {
		return EINVAL;
	}

	ret = copyin(buffer, &debug, buffer_size);
	if (ret) {
		return ret;
	}

	/* Panic bits match kMemorystatusKilled* enum */
	memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask);

	/* Copyout new value */
	debug.data = memorystatus_jetsam_panic_debug;
	ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t));

	return ret;
}

/*
 * Triggers a sort_order on a specified jetsam priority band.
 * This is for testing only, used to force a path through the sort
 * function.
 */
static int
memorystatus_cmd_test_jetsam_sort(int priority, int sort_order) {

	int error = 0;

	unsigned int bucket_index = 0;

	if (priority == -1) {
		/* Use as shorthand for default priority */
		bucket_index = JETSAM_PRIORITY_DEFAULT;
	} else {
		bucket_index = (unsigned int)priority;
	}

	error = memorystatus_sort_bucket(bucket_index, sort_order);

	return (error);
}
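
/*
 * Illustrative sketch (not part of this file): on DEVELOPMENT/DEBUG kernels
 * this test hook is assumed to be reachable through memorystatus_control(),
 * with the band passed in the pid argument and the sort order in the flags
 * argument (the command name below is an assumption):
 *
 *	// sort the default band using the default ordering
 *	memorystatus_control(MEMORYSTATUS_CMD_TEST_JETSAM_SORT,
 *			     -1, JETSAM_SORT_DEFAULT, NULL, 0);
 */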

#endif

/*
 * Jetsam a specific process.
 */
static boolean_t
memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause) {
	boolean_t killed;
	proc_t p;

	/* TODO - add a victim queue and push this into the main jetsam thread */

	p = proc_find(victim_pid);
	if (!p) {
		return FALSE;
	}

	proc_list_lock();

	if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) ||
	    (p->p_listflag & P_LIST_EXITED) ||
	    (p->p_memstat_state & P_MEMSTAT_ERROR)) {
		proc_list_unlock();
		proc_rele(p);
		return FALSE;
	}

	p->p_memstat_state |= P_MEMSTAT_TERMINATED;

	if (memorystatus_jetsam_snapshot_count == 0) {
		memorystatus_init_jetsam_snapshot_locked(NULL,0);
	}

	memorystatus_update_jetsam_snapshot_entry_locked(p, cause);
	proc_list_unlock();

	printf("memorystatus: specifically killing pid %d [%s] (%s %d) - memorystatus_available_pages: %d\n",
		victim_pid, (p->p_comm ? p->p_comm : "(unknown)"),
		jetsam_kill_cause_name[cause], p->p_memstat_effectivepriority, memorystatus_available_pages);

	killed = memorystatus_do_kill(p, cause);
	proc_rele(p);

	return killed;
}

/*
 * Jetsam the first process in the queue.
 */
static boolean_t
memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, int32_t *priority, uint32_t *errors)
{
    pid_t aPid;
    proc_t p = PROC_NULL, next_p = PROC_NULL;
    boolean_t new_snapshot = FALSE, killed = FALSE;
    int kill_count = 0;
    unsigned int i = 0;
    uint32_t aPid_ep;

#ifndef CONFIG_FREEZE
#pragma unused(any)
#endif

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
        memorystatus_available_pages, 0, 0, 0, 0);

    if (sort_flag == TRUE) {
        (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
    }

    proc_list_lock();

    next_p = memorystatus_get_first_proc_locked(&i, TRUE);
    while (next_p) {
#if DEVELOPMENT || DEBUG
        int activeProcess;
        int procSuspendedForDiagnosis;
#endif /* DEVELOPMENT || DEBUG */

        p = next_p;
        next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);

#if DEVELOPMENT || DEBUG
        activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
        procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
#endif /* DEVELOPMENT || DEBUG */

        aPid = p->p_pid;
        aPid_ep = p->p_memstat_effectivepriority;

        if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
            continue;
        }

#if DEVELOPMENT || DEBUG
        if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
            printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
            continue;
        }
#endif /* DEVELOPMENT || DEBUG */

        if (cause == kMemorystatusKilledVnodes)
        {
            /*
             * If the system runs out of vnodes, we systematically jetsam
             * processes in hopes of stumbling onto a vnode gain that helps
             * the system recover. The process that happens to trigger
             * this path has no known relationship to the vnode consumption.
             * We attempt to safeguard that process, i.e., we do not jetsam it.
             */

            if (p == current_proc()) {
                /* do not jetsam the current process */
                continue;
            }
        }

#if CONFIG_FREEZE
        boolean_t skip;
        boolean_t reclaim_proc = !(p->p_memstat_state & (P_MEMSTAT_LOCKED | P_MEMSTAT_NORECLAIM));
        if (any || reclaim_proc) {
            skip = FALSE;
        } else {
            skip = TRUE;
        }

        if (skip) {
            continue;
        } else
#endif
        {
            /*
             * Capture a snapshot if none exists and:
             * - a priority was not requested (this is something other than an ambient kill), or
             * - a priority was requested *and* the targeted process is not at idle priority
             */
            if ((memorystatus_jetsam_snapshot_count == 0) &&
                (memorystatus_idle_snapshot || ((!priority) || (priority && (*priority != JETSAM_PRIORITY_IDLE))))) {
                memorystatus_init_jetsam_snapshot_locked(NULL, 0);
                new_snapshot = TRUE;
            }

            /*
             * Mark as terminated so that if exit1() indicates success, but the process (for example)
             * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
             * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
             * acquisition of the proc lock.
             */
            p->p_memstat_state |= P_MEMSTAT_TERMINATED;

#if DEVELOPMENT || DEBUG
            if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && activeProcess) {
                MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n",
                    aPid, (p->p_comm ? p->p_comm : "(unknown)"), memorystatus_level);
                memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic);
                p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
                if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) {
                    jetsam_diagnostic_suspended_one_active_proc = 1;
                    printf("jetsam: returning after suspending first active proc - %d\n", aPid);
                }

                p = proc_ref_locked(p);
                proc_list_unlock();
                if (p) {
                    task_suspend(p->task);
                    if (priority) {
                        *priority = aPid_ep;
                    }
                    proc_rele(p);
                    killed = TRUE;
                }

                goto exit;
            } else
#endif /* DEVELOPMENT || DEBUG */
            {
                /* Shift queue, update stats */
                memorystatus_update_jetsam_snapshot_entry_locked(p, cause);

                if (proc_ref_locked(p) == p) {
                    proc_list_unlock();
                    printf("memorystatus: %s %d [%s] (%s %d) - memorystatus_available_pages: %d\n",
                        ((aPid_ep == JETSAM_PRIORITY_IDLE) ?
                        "idle exiting pid" : "jetsam killing pid"),
                        aPid, (p->p_comm ? p->p_comm : "(unknown)"),
                        jetsam_kill_cause_name[cause], aPid_ep, memorystatus_available_pages);

                    killed = memorystatus_do_kill(p, cause);

                    /* Success? */
                    if (killed) {
                        if (priority) {
                            *priority = aPid_ep;
                        }
                        proc_rele(p);
                        kill_count++;
                        goto exit;
                    }

                    /*
                     * Failure - first unwind the state,
                     * then fall through to restart the search.
                     */
                    proc_list_lock();
                    proc_rele_locked(p);
                    p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
                    p->p_memstat_state |= P_MEMSTAT_ERROR;
                    *errors += 1;
                }

                /*
                 * Failure - restart the search.
                 *
                 * We might have raced with "p" exiting on another core, resulting in no
                 * ref on "p". Or, we may have failed to kill "p".
                 *
                 * Either way, we fall through to here, leaving the proc in the
                 * P_MEMSTAT_TERMINATED state.
                 *
                 * And, we hold the proc_list_lock at this point.
                 */

                i = 0;
                next_p = memorystatus_get_first_proc_locked(&i, TRUE);
            }
        }
    }

    proc_list_unlock();

exit:
    /* Clear snapshot if freshly captured and no target was found */
    if (new_snapshot && !killed) {
        memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
    }

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
        memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);

    return killed;
}
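
/*
 * Note on the failure path above: a proc that could not be killed is left
 * marked P_MEMSTAT_ERROR and the scan restarts from the highest-priority
 * band (i = 0). Such procs are skipped on every subsequent pass, so the
 * error bit must be cleared out of band; memorystatus_kill_process_sync()
 * does this by calling memorystatus_clear_errors() once the hunt is over.
 */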

/*
 * Jetsam aggressively
 */
static boolean_t
memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr_count, int32_t priority_max,
    uint32_t *errors)
{
    pid_t aPid;
    proc_t p = PROC_NULL, next_p = PROC_NULL;
    boolean_t new_snapshot = FALSE, killed = FALSE;
    int kill_count = 0;
    unsigned int i = 0;
    int32_t aPid_ep = 0;

#pragma unused(any)

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
        memorystatus_available_pages, priority_max, 0, 0, 0);

    proc_list_lock();

    next_p = memorystatus_get_first_proc_locked(&i, TRUE);
    while (next_p) {
#if DEVELOPMENT || DEBUG
        int activeProcess;
        int procSuspendedForDiagnosis;
#endif /* DEVELOPMENT || DEBUG */

        if ((unsigned int)(next_p->p_memstat_effectivepriority) != i) {

            /*
             * We have raced with next_p running on another core, as it has
             * moved to a different jetsam priority band. This means we have
             * lost our place in line while traversing the jetsam list. We
             * attempt to recover by rewinding to the beginning of the band
             * we were already traversing. By doing this, we do not guarantee
             * that no process escapes this aggressive march, but we can make
             * skipping an entire range of processes less likely. (PR-21069019)
             */

            MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding %s moved from band %d --> %d\n",
                aggr_count, next_p->p_comm, i, next_p->p_memstat_effectivepriority);

            next_p = memorystatus_get_first_proc_locked(&i, TRUE);
            continue;
        }

        p = next_p;
        next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);

        if (p->p_memstat_effectivepriority > priority_max) {
            /*
             * Bail out of this killing spree if we have
             * reached beyond the priority_max jetsam band.
             * That is, we kill up to and through the
             * priority_max jetsam band.
             */
            proc_list_unlock();
            goto exit;
        }

#if DEVELOPMENT || DEBUG
        activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
        procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
#endif /* DEVELOPMENT || DEBUG */

        aPid = p->p_pid;
        aPid_ep = p->p_memstat_effectivepriority;

        if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
            continue;
        }

#if DEVELOPMENT || DEBUG
        if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
            printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
            continue;
        }
#endif /* DEVELOPMENT || DEBUG */

        /*
         * Capture a snapshot if none exists.
         */
        if (memorystatus_jetsam_snapshot_count == 0) {
            memorystatus_init_jetsam_snapshot_locked(NULL, 0);
            new_snapshot = TRUE;
        }

        /*
         * Mark as terminated so that if exit1() indicates success, but the process (for example)
         * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
         * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
         * acquisition of the proc lock.
         */
        p->p_memstat_state |= P_MEMSTAT_TERMINATED;

        /* Shift queue, update stats */
        memorystatus_update_jetsam_snapshot_entry_locked(p, cause);

        /*
         * In order to kill the target process, we will drop the proc_list_lock.
         * To guarantee that p and next_p don't disappear out from under the lock,
         * we must take a ref on both.
         * If we cannot get a reference, then it's likely we've raced with
         * that process exiting on another core.
         */
        if (proc_ref_locked(p) == p) {
            if (next_p) {
                while (next_p && (proc_ref_locked(next_p) != next_p)) {
                    proc_t temp_p;

                    /*
                     * We must have raced with next_p exiting on another core.
                     * Recover by getting the next eligible process in the band.
                     */

                    MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
                        aggr_count, next_p->p_pid, (next_p->p_comm ? next_p->p_comm : "(unknown)"));

                    temp_p = next_p;
                    next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE);
                }
            }
            proc_list_unlock();

            printf("memorystatus: aggressive%d: %s %d [%s] (%s %d) - memorystatus_available_pages: %d\n",
                aggr_count,
                ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "idle exiting pid" : "jetsam killing pid"),
                aPid, (p->p_comm ? p->p_comm : "(unknown)"),
                jetsam_kill_cause_name[cause], aPid_ep, memorystatus_available_pages);

            killed = memorystatus_do_kill(p, cause);

            /* Success? */
            if (killed) {
                proc_rele(p);
                kill_count++;
                p = NULL;
                killed = FALSE;

                /*
                 * Continue the killing spree.
                 */
                proc_list_lock();
                if (next_p) {
                    proc_rele_locked(next_p);
                }
                continue;
            }

            /*
             * Failure - first unwind the state,
             * then fall through to restart the search.
             */
            proc_list_lock();
            proc_rele_locked(p);
            if (next_p) {
                proc_rele_locked(next_p);
            }
            p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
            p->p_memstat_state |= P_MEMSTAT_ERROR;
            *errors += 1;
            p = NULL;
        }

        /*
         * Failure - restart the search at the beginning of
         * the band we were already traversing.
         *
         * We might have raced with "p" exiting on another core, resulting in no
         * ref on "p". Or, we may have failed to kill "p".
         *
         * Either way, we fall through to here, leaving the proc in the
         * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
         *
         * And, we hold the proc_list_lock at this point.
         */

        next_p = memorystatus_get_first_proc_locked(&i, TRUE);
    }

    proc_list_unlock();

exit:
    /* Clear snapshot if freshly captured and no target was found */
    if (new_snapshot && (kill_count == 0)) {
        memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
    }

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
        memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);

    if (kill_count > 0) {
        return (TRUE);
    } else {
        return (FALSE);
    }
}
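
/*
 * Worked example of the priority_max cutoff (band choice is illustrative):
 * with priority_max == JETSAM_PRIORITY_FOREGROUND, the aggressive sweep
 * kills through every band up to and including the foreground band, and
 * bails out the first time it dequeues a proc whose effective priority
 * exceeds that band. Holding refs on both p and next_p is what allows the
 * proc_list_lock to be dropped for the kill without either proc being
 * freed underneath the traversal.
 */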

#if LEGACY_HIWATER

static boolean_t
memorystatus_kill_hiwat_proc(uint32_t *errors)
{
    pid_t aPid = 0;
    proc_t p = PROC_NULL, next_p = PROC_NULL;
    boolean_t new_snapshot = FALSE, killed = FALSE;
    int kill_count = 0;
    unsigned int i = 0;
    uint32_t aPid_ep;

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
        memorystatus_available_pages, 0, 0, 0, 0);

    proc_list_lock();

    next_p = memorystatus_get_first_proc_locked(&i, TRUE);
    while (next_p) {
        uint32_t footprint;
        boolean_t skip;

        p = next_p;
        next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);

        aPid = p->p_pid;
        aPid_ep = p->p_memstat_effectivepriority;

        if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
            continue;
        }

        /* skip if no limit set */
        if (p->p_memstat_memlimit <= 0) {
            continue;
        }

#if 0
        /*
         * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore.
         * Background limits are described via the inactive limit slots.
         * Their fatal/non-fatal setting will drive whether or not to be
         * considered in this kill path.
         */

        /* skip if a currently inapplicable limit is encountered */
        if ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
            continue;
        }
#endif

        footprint = (uint32_t)(get_task_phys_footprint(p->task) / (1024 * 1024));
        skip = (((int32_t)footprint) <= p->p_memstat_memlimit);

#if DEVELOPMENT || DEBUG
        if (!skip && (memorystatus_jetsam_policy & kPolicyDiagnoseActive)) {
            if (p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED) {
                continue;
            }
        }
#endif /* DEVELOPMENT || DEBUG */

#if CONFIG_FREEZE
        if (!skip) {
            if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
                skip = TRUE;
            } else {
                skip = FALSE;
            }
        }
#endif

        if (skip) {
            continue;
        } else {
            MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %d Mb > 1 (%d Mb)\n",
                (memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending" : "killing", aPid, p->p_comm, footprint, p->p_memstat_memlimit);

            if (memorystatus_jetsam_snapshot_count == 0) {
                memorystatus_init_jetsam_snapshot_locked(NULL, 0);
                new_snapshot = TRUE;
            }

            p->p_memstat_state |= P_MEMSTAT_TERMINATED;

#if DEVELOPMENT || DEBUG
            if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
                MEMORYSTATUS_DEBUG(1, "jetsam: pid %d suspended for diagnosis - memorystatus_available_pages: %d\n", aPid, memorystatus_available_pages);
                memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic);
                p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;

                p = proc_ref_locked(p);
                proc_list_unlock();
                if (p) {
                    task_suspend(p->task);
                    proc_rele(p);
                    killed = TRUE;
                }

                goto exit;
            } else
#endif /* DEVELOPMENT || DEBUG */
            {
                memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledHiwat);

                if (proc_ref_locked(p) == p) {
                    proc_list_unlock();

                    printf("memorystatus: jetsam killing pid %d [%s] (highwater %d) - memorystatus_available_pages: %d\n",
                        aPid, (p->p_comm ? p->p_comm : "(unknown)"), aPid_ep, memorystatus_available_pages);

                    killed = memorystatus_do_kill(p, kMemorystatusKilledHiwat);

                    /* Success? */
                    if (killed) {
                        proc_rele(p);
                        kill_count++;
                        goto exit;
                    }

                    /*
                     * Failure - first unwind the state,
                     * then fall through to restart the search.
                     */
                    proc_list_lock();
                    proc_rele_locked(p);
                    p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
                    p->p_memstat_state |= P_MEMSTAT_ERROR;
                    *errors += 1;
                }

                /*
                 * Failure - restart the search.
                 *
                 * We might have raced with "p" exiting on another core, resulting in no
                 * ref on "p". Or, we may have failed to kill "p".
                 *
                 * Either way, we fall through to here, leaving the proc in the
                 * P_MEMSTAT_TERMINATED state.
                 *
                 * And, we hold the proc_list_lock at this point.
                 */

                i = 0;
                next_p = memorystatus_get_first_proc_locked(&i, TRUE);
            }
        }
    }

    proc_list_unlock();

exit:
    /* Clear snapshot if freshly captured and no target was found */
    if (new_snapshot && !killed) {
        memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
    }

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
        memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);

    return killed;
}
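
/*
 * Units in the highwater check above: get_task_phys_footprint() returns
 * bytes, so footprint is converted to MB before comparison with
 * p_memstat_memlimit, which is likewise expressed in MB. For illustration
 * (hypothetical numbers): a proc with a 120 MB physical footprint and a
 * 100 MB limit yields skip == FALSE and becomes a highwater kill
 * candidate; at 80 MB it is skipped.
 */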

#endif /* LEGACY_HIWATER */

static boolean_t
memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) {
    /* TODO: allow a general async path */
    if ((victim_pid != -1) || (cause != kMemorystatusKilledVMPageShortage && cause != kMemorystatusKilledVMThrashing &&
        cause != kMemorystatusKilledFCThrashing)) {
        return FALSE;
    }

    kill_under_pressure_cause = cause;
    memorystatus_thread_wake();
    return TRUE;
}
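
/*
 * The async path does no killing itself: it records the cause in
 * kill_under_pressure_cause and wakes the jetsam thread, which performs
 * the actual kill later. Only pid -1 (i.e. "any process") with a
 * page-shortage or thrashing cause may take this path; a second async
 * request arriving before the thread runs simply overwrites the pending
 * cause.
 */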

static boolean_t
memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause) {
    boolean_t res;
    uint32_t errors = 0;

    if (victim_pid == -1) {
        /* No pid, so kill first process */
        res = memorystatus_kill_top_process(TRUE, TRUE, cause, NULL, &errors);
    } else {
        res = memorystatus_kill_specific_process(victim_pid, cause);
    }

    if (errors) {
        memorystatus_clear_errors();
    }

    if (res == TRUE) {
        /* Fire off snapshot notification */
        size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
            sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
        uint64_t timestamp_now = mach_absolute_time();
        memorystatus_jetsam_snapshot->notification_time = timestamp_now;
        if (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
            timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout) {
            int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
            if (!ret) {
                proc_list_lock();
                memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
                proc_list_unlock();
            }
        }
    }

    return res;
}
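
/*
 * The snapshot notification above is rate-limited: a new
 * kMemorystatusSnapshotNote is posted only if no snapshot notification
 * has gone out within memorystatus_jetsam_snapshot_timeout of the current
 * mach_absolute_time(), so a burst of kills produces one consolidated
 * notification rather than one per victim.
 */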

boolean_t
memorystatus_kill_on_VM_page_shortage(boolean_t async) {
    if (async) {
        return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
    } else {
        return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage);
    }
}

boolean_t
memorystatus_kill_on_VM_thrashing(boolean_t async) {
    if (async) {
        return memorystatus_kill_process_async(-1, kMemorystatusKilledVMThrashing);
    } else {
        return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing);
    }
}

boolean_t
memorystatus_kill_on_FC_thrashing(boolean_t async) {
    if (async) {
        return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing);
    } else {
        return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing);
    }
}

boolean_t
memorystatus_kill_on_vnode_limit(void) {
    return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes);
}
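
/*
 * Usage sketch (assumed call sites, for illustration only): the VM and
 * VFS layers are expected to invoke these wrappers when they detect the
 * corresponding shortage, e.g.
 *
 *    memorystatus_kill_on_VM_page_shortage(TRUE);    // async, from page shortage handling
 *    memorystatus_kill_on_vnode_limit();             // sync, from vnode allocation
 *
 * The async variants return immediately after waking the jetsam thread;
 * the sync variants block until a victim has been killed or the search
 * comes up empty.
 */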

#endif /* CONFIG_JETSAM */

#if CONFIG_FREEZE

__private_extern__ void
memorystatus_freeze_init(void)
{
    kern_return_t result;
    thread_t thread;

    freezer_lck_grp_attr = lck_grp_attr_alloc_init();
    freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr);

    lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL);

    result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread);
    if (result == KERN_SUCCESS) {
        thread_deallocate(thread);
    } else {
        panic("Could not create memorystatus_freeze_thread");
    }
}

/*
 * Synchronously freeze the passed proc. Called with a reference to the proc held.
 *
 * Returns EINVAL or the value returned by task_freeze().
 */
int
memorystatus_freeze_process_sync(proc_t p)
{
    int ret = EINVAL;
    pid_t aPid = 0;
    boolean_t memorystatus_freeze_swap_low = FALSE;

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
        memorystatus_available_pages, 0, 0, 0, 0);

    lck_mtx_lock(&freezer_mutex);

    if (p == NULL) {
        goto exit;
    }

    if (memorystatus_freeze_enabled == FALSE) {
        goto exit;
    }

    if (!memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
        goto exit;
    }

    if (memorystatus_freeze_update_throttle()) {
        printf("memorystatus_freeze_process_sync: in throttle, ignoring freeze\n");
        memorystatus_freeze_throttle_count++;
        goto exit;
    }

    proc_list_lock();

    if (p != NULL) {
        uint32_t purgeable, wired, clean, dirty, state;
        uint32_t max_pages, pages, i;
        boolean_t shared;

        aPid = p->p_pid;
        state = p->p_memstat_state;

        /* Ensure the process is eligible for freezing */
        if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
            proc_list_unlock();
            goto exit;
        }

        /* Only freeze processes meeting our minimum resident page criteria */
        memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
        if (pages < memorystatus_freeze_pages_min) {
            proc_list_unlock();
            goto exit;
        }

        if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {

            unsigned int avail_swap_space = 0; /* in pages. */

            if (DEFAULT_FREEZER_IS_ACTIVE) {
                /*
                 * Freezer backed by default pager and swap file(s).
                 */
                avail_swap_space = default_pager_swap_pages_free();
            } else {
                /*
                 * Freezer backed by the compressor and swap file(s)
                 * which will hold the compressed data.
                 */
                avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
            }

            max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);

            if (max_pages < memorystatus_freeze_pages_min) {
                proc_list_unlock();
                goto exit;
            }
        } else {
            /*
             * We only have the compressor without any swap.
             */
            max_pages = UINT32_MAX - 1;
        }

        /* Mark as locked temporarily to avoid kill */
        p->p_memstat_state |= P_MEMSTAT_LOCKED;
        proc_list_unlock();

        ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);

        MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_process_sync: task_freeze %s for pid %d [%s] - "
            "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n",
            (ret == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (p->p_comm ? p->p_comm : "(unknown)"),
            memorystatus_available_pages, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free());

        proc_list_lock();
        p->p_memstat_state &= ~P_MEMSTAT_LOCKED;

        if (ret == KERN_SUCCESS) {
            memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };

            memorystatus_frozen_count++;

            p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0 : P_MEMSTAT_NORECLAIM));

            if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
                /* Update stats */
                for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
                    throttle_intervals[i].pageouts += dirty;
                }
            }

            memorystatus_freeze_pageouts += dirty;
            memorystatus_freeze_count++;

            proc_list_unlock();

            memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
        } else {
            proc_list_unlock();
        }
    }

exit:
    lck_mtx_unlock(&freezer_mutex);
    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
        memorystatus_available_pages, aPid, 0, 0, 0);

    return ret;
}
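
/*
 * Worked example of the max_pages clamp above (hypothetical numbers):
 * with 50000 free swap pages and memorystatus_freeze_pages_max == 32768,
 * max_pages = MIN(50000, 32768) = 32768 pages handed to task_freeze().
 * If free swap drops far enough that max_pages would fall below
 * memorystatus_freeze_pages_min, the freeze is abandoned instead of
 * attempting a partial freeze that could not be swapped out.
 */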

static int
memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low)
{
    pid_t aPid = 0;
    int ret = -1;
    proc_t p = PROC_NULL, next_p = PROC_NULL;
    unsigned int i = 0;

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
        memorystatus_available_pages, 0, 0, 0, 0);

    proc_list_lock();

    next_p = memorystatus_get_first_proc_locked(&i, TRUE);
    while (next_p) {
        kern_return_t kr;
        uint32_t purgeable, wired, clean, dirty;
        boolean_t shared;
        uint32_t pages;
        uint32_t max_pages = 0;
        uint32_t state;

        p = next_p;
        next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);

        aPid = p->p_pid;
        state = p->p_memstat_state;

        /* Ensure the process is eligible for freezing */
        if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
            continue; // with lock held
        }

        /* Only freeze processes meeting our minimum resident page criteria */
        memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
        if (pages < memorystatus_freeze_pages_min) {
            continue; // with lock held
        }

        if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {

            /* Ensure there's enough free space to freeze this process. */

            unsigned int avail_swap_space = 0; /* in pages. */

            if (DEFAULT_FREEZER_IS_ACTIVE) {
                /*
                 * Freezer backed by default pager and swap file(s).
                 */
                avail_swap_space = default_pager_swap_pages_free();
            } else {
                /*
                 * Freezer backed by the compressor and swap file(s)
                 * which will hold the compressed data.
                 */
                avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
            }

            max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);

            if (max_pages < memorystatus_freeze_pages_min) {
                *memorystatus_freeze_swap_low = TRUE;
                proc_list_unlock();
                goto exit;
            }
        } else {
            /*
             * We only have the compressor pool.
             */
            max_pages = UINT32_MAX - 1;
        }

        /* Mark as locked temporarily to avoid kill */
        p->p_memstat_state |= P_MEMSTAT_LOCKED;

        p = proc_ref_locked(p);
        proc_list_unlock();
        if (!p) {
            goto exit;
        }

        kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);

        MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - "
            "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n",
            (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (p->p_comm ? p->p_comm : "(unknown)"),
            memorystatus_available_pages, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free());

        proc_list_lock();
        p->p_memstat_state &= ~P_MEMSTAT_LOCKED;

        /* Success? */
        if (KERN_SUCCESS == kr) {
            memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };

            memorystatus_frozen_count++;

            p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0 : P_MEMSTAT_NORECLAIM));

            if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
                /* Update stats */
                for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
                    throttle_intervals[i].pageouts += dirty;
                }
            }

            memorystatus_freeze_pageouts += dirty;
            memorystatus_freeze_count++;

            proc_list_unlock();

            memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));

            /* Return KERN_SUCCESS */
            ret = kr;

        } else {
            proc_list_unlock();
        }

        proc_rele(p);
        goto exit;
    }

    proc_list_unlock();

exit:
    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
        memorystatus_available_pages, aPid, 0, 0, 0);

    return ret;
}
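
/*
 * Note: this routine attempts at most one freeze per invocation - the
 * first eligible proc is referenced, frozen (or not), released, and the
 * function exits. The freezer thread below therefore freezes one process
 * per wakeup and relies on being woken again for further candidates.
 */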

static inline boolean_t
memorystatus_can_freeze_processes(void)
{
    boolean_t ret;

    proc_list_lock();

    if (memorystatus_suspended_count) {
        uint32_t average_resident_pages, estimated_processes;

        /* Estimate the number of suspended processes we can fit */
        average_resident_pages = memorystatus_suspended_footprint_total / memorystatus_suspended_count;
        estimated_processes = memorystatus_suspended_count +
            ((memorystatus_available_pages - memorystatus_available_pages_critical) / average_resident_pages);

        /* If it's predicted that no freeze will occur, lower the threshold temporarily */
        if (estimated_processes <= FREEZE_SUSPENDED_THRESHOLD_DEFAULT) {
            memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_LOW;
        } else {
            memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
        }

        MEMORYSTATUS_DEBUG(1, "memorystatus_can_freeze_processes: %d suspended processes, %d average resident pages / process, %d suspended processes estimated\n",
            memorystatus_suspended_count, average_resident_pages, estimated_processes);

        if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) {
            ret = TRUE;
        } else {
            ret = FALSE;
        }
    } else {
        ret = FALSE;
    }

    proc_list_unlock();

    return ret;
}
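
/*
 * Worked example of the estimate above (hypothetical numbers): with 8
 * suspended procs totalling 40000 resident pages, average_resident_pages
 * = 5000; if 25000 pages sit between memorystatus_available_pages and the
 * critical level, estimated_processes = 8 + 25000/5000 = 13. Landing
 * above FREEZE_SUSPENDED_THRESHOLD_DEFAULT keeps the default threshold;
 * landing at or below it temporarily lowers the threshold so freezing can
 * still make progress.
 */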

static boolean_t
memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
{
    boolean_t can_freeze = TRUE;

    /* Only freeze if we're sufficiently low on memory; this holds off freeze right
       after boot, and is generally a no-op once we've reached steady state. */
    if (memorystatus_available_pages > memorystatus_freeze_threshold) {
        return FALSE;
    }

    /* Check minimum suspended process threshold. */
    if (!memorystatus_can_freeze_processes()) {
        return FALSE;
    }

    if (COMPRESSED_PAGER_IS_SWAPLESS || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS) {
        /*
         * In-core compressor used for freezing WITHOUT on-disk swap support.
         */

        if (vm_compressor_low_on_space()) {
            if (memorystatus_freeze_swap_low) {
                *memorystatus_freeze_swap_low = TRUE;
            }

            can_freeze = FALSE;

        } else {
            if (memorystatus_freeze_swap_low) {
                *memorystatus_freeze_swap_low = FALSE;
            }

            can_freeze = TRUE;
        }
    } else {
        /*
         * Freezing WITH on-disk swap support.
         */

        if (DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
            /*
             * In-core compressor fronts the swap.
             */
            if (vm_swap_low_on_space()) {
                if (memorystatus_freeze_swap_low) {
                    *memorystatus_freeze_swap_low = TRUE;
                }

                can_freeze = FALSE;
            }

        } else if (DEFAULT_FREEZER_IS_ACTIVE) {
            /*
             * Legacy freeze mode with no compressor support.
             */
            if (default_pager_swap_pages_free() < memorystatus_freeze_pages_min) {
                if (memorystatus_freeze_swap_low) {
                    *memorystatus_freeze_swap_low = TRUE;
                }

                can_freeze = FALSE;
            }
        } else {
            panic("Not a valid freeze configuration.\n");
        }
    }

    return can_freeze;
}

static void
memorystatus_freeze_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
{
    unsigned int freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE);
    if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) {
        if (!interval->max_pageouts) {
            interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / (24 * 60)));
        } else {
            printf("memorystatus_freeze_update_throttle_interval: %d minute throttle timeout, resetting\n", interval->mins);
        }
        interval->ts.tv_sec = interval->mins * 60;
        interval->ts.tv_nsec = 0;
        ADD_MACH_TIMESPEC(&interval->ts, ts);
        /* Since we update the throttle stats pre-freeze, adjust for overshoot here */
        if (interval->pageouts > interval->max_pageouts) {
            interval->pageouts -= interval->max_pageouts;
        } else {
            interval->pageouts = 0;
        }
        interval->throttle = FALSE;
    } else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) {
        printf("memorystatus_freeze_update_throttle_interval: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
        interval->throttle = TRUE;
    }

    MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
        interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60,
        interval->throttle ? "on" : "off");
}
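
/*
 * Budget arithmetic, for illustration (hypothetical values): with
 * memorystatus_freeze_daily_mb_max == 1024 and 4 KB pages,
 * freeze_daily_pageouts_max = 1024 * 256 = 262144 pages/day. A 60-minute
 * interval with burst_multiple 2 then gets
 * max_pageouts = 2 * ((60 * 262144) / 1440) = 21844 pages before that
 * interval throttles further freezes.
 */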

static boolean_t
memorystatus_freeze_update_throttle(void)
{
    clock_sec_t sec;
    clock_nsec_t nsec;
    mach_timespec_t ts;
    uint32_t i;
    boolean_t throttled = FALSE;

#if DEVELOPMENT || DEBUG
    if (!memorystatus_freeze_throttle_enabled)
        return FALSE;
#endif

    clock_get_system_nanotime(&sec, &nsec);
    ts.tv_sec = sec;
    ts.tv_nsec = nsec;

    /* Check freeze pageouts over multiple intervals and throttle if we've exceeded our budget.
     *
     * This ensures that periods of inactivity can't be used as 'credit' towards freeze if the device has
     * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in
     * order to allow for bursts of activity.
     */
    for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
        memorystatus_freeze_update_throttle_interval(&ts, &throttle_intervals[i]);
        if (throttle_intervals[i].throttle == TRUE)
            throttled = TRUE;
    }

    return throttled;
}

static void
memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
{
    static boolean_t memorystatus_freeze_swap_low = FALSE;

    lck_mtx_lock(&freezer_mutex);
    if (memorystatus_freeze_enabled) {
        if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
            /* Only freeze if we've not exceeded our pageout budgets. */
            if (!memorystatus_freeze_update_throttle()) {
                memorystatus_freeze_top_process(&memorystatus_freeze_swap_low);
            } else {
                printf("memorystatus_freeze_thread: in throttle, ignoring freeze\n");
                memorystatus_freeze_throttle_count++; /* Throttled, update stats */
            }
        }
    }
    lck_mtx_unlock(&freezer_mutex);

    assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
    thread_block((thread_continue_t) memorystatus_freeze_thread);
}

#endif /* CONFIG_FREEZE */

#if VM_PRESSURE_EVENTS

#if CONFIG_MEMORYSTATUS

static int
memorystatus_send_note(int event_code, void *data, size_t data_length) {
    int ret;
    struct kev_msg ev_msg;

    ev_msg.vendor_code = KEV_VENDOR_APPLE;
    ev_msg.kev_class = KEV_SYSTEM_CLASS;
    ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;

    ev_msg.event_code = event_code;

    ev_msg.dv[0].data_length = data_length;
    ev_msg.dv[0].data_ptr = data;
    ev_msg.dv[1].data_length = 0;

    ret = kev_post_msg(&ev_msg);
    if (ret) {
        printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
    }

    return ret;
}
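
/*
 * All memorystatus notifications funnel through here as Apple system
 * kevents (KEV_MEMORYSTATUS_SUBCLASS). For example, the freeze path above
 * posts its per-process record with:
 *
 *    memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
 *    memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
 *
 * A non-zero return means kev_post_msg() failed; the snapshot path, for
 * instance, only updates its last-notification timestamp on success.
 */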

boolean_t
memorystatus_warn_process(pid_t pid, boolean_t critical) {

    boolean_t ret = FALSE;
    boolean_t found_knote = FALSE;
    struct knote *kn = NULL;

    /*
     * See comment in sysctl_memorystatus_vm_pressure_send.
     */

    memorystatus_klist_lock();

    SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
        proc_t knote_proc = kn->kn_kq->kq_p;
        pid_t knote_pid = knote_proc->p_pid;

        if (knote_pid == pid) {
            /*
             * By setting the "fflags" here, we are forcing
             * a process to deal with the case where it's
             * bumping up into its memory limits. If we don't
             * do this here, we will end up depending on the
             * system pressure snapshot evaluation in
             * filt_memorystatus().
             */

            if (critical) {
                if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
                    kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
                } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
                    kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
                }
            } else {
                if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
                    kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
                }
            }

            found_knote = TRUE;
        }
    }

    if (found_knote) {
        KNOTE(&memorystatus_klist, 0);
        ret = TRUE;
    } else {
        if (vm_dispatch_pressure_note_to_pid(pid, FALSE) == 0) {
            ret = TRUE;
        }
    }

    memorystatus_klist_unlock();

    return ret;
}

/*
 * Can only be set by the current task on itself.
 */
int
memorystatus_low_mem_privileged_listener(uint32_t op_flags)
{
    boolean_t set_privilege = FALSE;
    /*
     * Need an entitlement check here?
     */
    if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
        set_privilege = TRUE;
    } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
        set_privilege = FALSE;
    } else {
        return EINVAL;
    }

    return (task_low_mem_privileged_listener(current_task(), set_privilege, NULL));
}

int
memorystatus_send_pressure_note(pid_t pid) {
    MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
    return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
}

void
memorystatus_send_low_swap_note(void) {

    struct knote *kn = NULL;

    memorystatus_klist_lock();
    SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
        /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
         * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
         * that has the NOTE_MEMORYSTATUS_LOW_SWAP flag set in its sfflags, we call KNOTE with
         * kMemorystatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
        if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
            KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
            break;
        }
    }

    memorystatus_klist_unlock();
}

boolean_t
memorystatus_bg_pressure_eligible(proc_t p) {
    boolean_t eligible = FALSE;

    proc_list_lock();

    MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);

    /* Foreground processes have already been dealt with at this point, so just test for eligibility */
    if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
        eligible = TRUE;
    }

    proc_list_unlock();

    return eligible;
}

boolean_t
memorystatus_is_foreground_locked(proc_t p) {
    return ((p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
        (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT));
}
#endif /* CONFIG_MEMORYSTATUS */

/*
 * Trigger levels to test the mechanism.
 * Can be used via a sysctl.
 */
#define TEST_LOW_MEMORY_TRIGGER_ONE 1
#define TEST_LOW_MEMORY_TRIGGER_ALL 2
#define TEST_PURGEABLE_TRIGGER_ONE 3
#define TEST_PURGEABLE_TRIGGER_ALL 4
#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE 5
#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL 6

boolean_t memorystatus_manual_testing_on = FALSE;
vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;

extern struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t);

extern
kern_return_t vm_pressure_notification_without_levels(boolean_t);

extern void vm_pressure_klist_lock(void);
extern void vm_pressure_klist_unlock(void);

extern void vm_reset_active_list(void);

extern void delay(int);

#define INTER_NOTIFICATION_DELAY (250000) /* .25 second */

void memorystatus_on_pageout_scan_end(void) {
    /* No-op */
}

/*
 * kn_max - knote
 *
 * knote_pressure_level - to check if the knote is registered for this notification level.
 *
 * task - task whose bits we'll be modifying
 *
 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
 *
 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
 *
 */

boolean_t
is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
{
    if (kn_max->kn_sfflags & knote_pressure_level) {

        if (task_has_been_notified(task, pressure_level_to_clear) == TRUE) {

            task_clear_has_been_notified(task, pressure_level_to_clear);
        }

        task_mark_has_been_notified(task, pressure_level_to_set);
        return TRUE;
    }

    return FALSE;
}
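
/*
 * Example of the clear/set pairing (as used by the caller below): when a
 * task is notified at warning level, kVMPressureCritical is passed as the
 * level to clear and kVMPressureWarning as the level to set. Clearing the
 * complementary bit means that if pressure later escalates to critical
 * (or falls back to warning), the task becomes eligible to be notified
 * again rather than being suppressed as already-notified.
 */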

extern kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process);

#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */

kern_return_t
memorystatus_update_vm_pressure(boolean_t target_foreground_process)
{
    struct knote *kn_max = NULL;
    struct knote *kn_cur = NULL, *kn_temp = NULL; /* for safe list traversal */
    pid_t target_pid = -1;
    struct klist dispatch_klist = { NULL };
    proc_t target_proc = PROC_NULL;
    struct task *task = NULL;
    boolean_t found_candidate = FALSE;

    static vm_pressure_level_t level_snapshot = kVMPressureNormal;
    static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
    boolean_t smoothing_window_started = FALSE;
    struct timeval smoothing_window_start_tstamp = {0, 0};
    struct timeval curr_tstamp = {0, 0};
    int elapsed_msecs = 0;

#if !CONFIG_JETSAM
#define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */

    int idle_kill_counter = 0;

    /*
     * On desktop we take this opportunity to free up memory pressure
     * by immediately killing idle exitable processes. We use a delay
     * to avoid overkill. And we impose a max counter as a fail safe
     * in case daemons re-launch too fast.
     */
    while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
        if (memorystatus_idle_exit_from_VM() == FALSE) {
            /* No idle exitable processes left to kill */
            break;
        }
        idle_kill_counter++;

        if (memorystatus_manual_testing_on == TRUE) {
            /*
             * Skip the delay when testing
             * the pressure notification scheme.
             */
        } else {
            delay(1000000); /* 1 second */
        }
    }
#endif /* !CONFIG_JETSAM */

    while (1) {

        /*
         * There is a race window here. But it's not clear
         * how much we benefit from having extra synchronization.
         */
        level_snapshot = memorystatus_vm_pressure_level;

        if (prev_level_snapshot > level_snapshot) {
            /*
             * Pressure decreased? Let's take a little breather
             * and see if this condition stays.
             */
            if (smoothing_window_started == FALSE) {

                smoothing_window_started = TRUE;
                microuptime(&smoothing_window_start_tstamp);
            }

            microuptime(&curr_tstamp);
            timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
            elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;

            if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {

                delay(INTER_NOTIFICATION_DELAY);
                continue;
            }
        }

        prev_level_snapshot = level_snapshot;
        smoothing_window_started = FALSE;

        memorystatus_klist_lock();
        kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);

        if (kn_max == NULL) {
            memorystatus_klist_unlock();

            /*
             * No more level-based clients to notify.
             * Try the non-level based notification clients.
             *
             * However, these non-level clients don't understand
             * the "return-to-normal" notification.
             *
             * So don't consider them for those notifications. Just
             * return instead.
             */

            if (level_snapshot != kVMPressureNormal) {
                goto try_dispatch_vm_clients;
            } else {
                return KERN_FAILURE;
            }
        }

        target_proc = kn_max->kn_kq->kq_p;

        proc_list_lock();
        if (target_proc != proc_ref_locked(target_proc)) {
            target_proc = PROC_NULL;
            proc_list_unlock();
            memorystatus_klist_unlock();
            continue;
        }
        proc_list_unlock();

        target_pid = target_proc->p_pid;

        task = (struct task *)(target_proc->task);

        if (level_snapshot != kVMPressureNormal) {

            if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {

                if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, kVMPressureCritical, kVMPressureWarning) == TRUE) {
                    found_candidate = TRUE;
                }
            } else {
                if (level_snapshot == kVMPressureCritical) {

                    if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, kVMPressureWarning, kVMPressureCritical) == TRUE) {
                        found_candidate = TRUE;
                    }
                }
            }
        } else {
            if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {

                task_clear_has_been_notified(task, kVMPressureWarning);
                task_clear_has_been_notified(task, kVMPressureCritical);

                found_candidate = TRUE;
            }
        }

        if (found_candidate == FALSE) {
            proc_rele(target_proc);
            memorystatus_klist_unlock();
            continue;
        }

        SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
            proc_t knote_proc = kn_cur->kn_kq->kq_p;
            pid_t knote_pid = knote_proc->p_pid;
            if (knote_pid == target_pid) {
                KNOTE_DETACH(&memorystatus_klist, kn_cur);
                KNOTE_ATTACH(&dispatch_klist, kn_cur);
            }
        }

        KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);

        SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
            KNOTE_DETACH(&dispatch_klist, kn_cur);
            KNOTE_ATTACH(&memorystatus_klist, kn_cur);
        }

        memorystatus_klist_unlock();

        microuptime(&target_proc->vm_pressure_last_notify_tstamp);
        proc_rele(target_proc);

        if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
            break;
        }

try_dispatch_vm_clients:
        if (kn_max == NULL && level_snapshot != kVMPressureNormal) {
            /*
             * We will exit this loop when we are done with
             * notification clients (level and non-level based).
             */
            if ((vm_pressure_notify_dispatch_vm_clients(target_foreground_process) == KERN_FAILURE) && (kn_max == NULL)) {
                /*
                 * kn_max == NULL i.e. we didn't find any eligible clients for the level-based notifications
                 * AND
                 * we have failed to find any eligible clients for the non-level based notifications too.
                 * So, we are done.
                 */

                return KERN_FAILURE;
            }
        }

        /*
         * LD: This block of code below used to be invoked in the older memory notification scheme on embedded every time
         * a process was sent a memory pressure notification. The "memorystatus_klist" list was used to hold these
         * privileged listeners. But now we have moved to the newer scheme and are trying to move away from the extra
         * notifications. So the code is here in case we break compatibility and need to send out notifications to the
         * privileged apps.
         */
#if 0
#endif /* 0 */

        if (memorystatus_manual_testing_on == TRUE) {
            /*
             * Testing out the pressure notification scheme.
             * No need for delays etc.
             */
        } else {

            uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
#if CONFIG_JETSAM
            unsigned int page_delta = 0;
            unsigned int skip_delay_page_threshold = 0;

            assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);

            page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
            skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;

            if (memorystatus_available_pages <= skip_delay_page_threshold) {
                /*
                 * We are nearing the critical mark fast and can't afford to wait between
                 * notifications.
                 */
                sleep_interval = 0;
            }
#endif /* CONFIG_JETSAM */

            if (sleep_interval) {
                delay(sleep_interval);
            }
        }
    }

    return KERN_SUCCESS;
}
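
/*
 * Timing of the "pressure decreased" smoothing above: a drop in level is
 * re-sampled every INTER_NOTIFICATION_DELAY (250 ms) for up to
 * VM_PRESSURE_DECREASED_SMOOTHING_PERIOD (5000 ms). Only if the lower
 * level survives the full window does it become prev_level_snapshot and
 * get propagated to clients, which keeps transient dips from generating
 * spurious return-to-normal notifications.
 */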

vm_pressure_level_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);

vm_pressure_level_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
{
    vm_pressure_level_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;

    switch (internal_pressure_level) {

    case kVMPressureNormal:
    {
        dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
        break;
    }

    case kVMPressureWarning:
    case kVMPressureUrgent:
    {
        dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
        break;
    }

    case kVMPressureCritical:
    {
        dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
        break;
    }

    default:
        break;
    }

    return dispatch_level;
}

static int
sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
    vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);

    return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
}

#if DEBUG || DEVELOPMENT

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#else /* DEBUG || DEVELOPMENT */

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#endif /* DEBUG || DEVELOPMENT */
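
/*
 * Userspace usage sketch (illustrative): the current level can be read
 * with
 *
 *    sysctl kern.memorystatus_vm_pressure_level
 *
 * which reports the sys/event.h dispatch constant
 * (NOTE_MEMORYSTATUS_PRESSURE_NORMAL/_WARN/_CRITICAL) rather than the
 * kernel-internal kVMPressure* value. On release kernels the OID is
 * CTLFLAG_MASKED and therefore hidden from sysctl listings.
 */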

extern int memorystatus_purge_on_warning;
extern int memorystatus_purge_on_critical;

static int
sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

    int level = 0;
    int error = 0;
    int pressure_level = 0;
    int trigger_request = 0;
    int force_purge;

    error = sysctl_handle_int(oidp, &level, 0, req);
    if (error || !req->newptr) {
        return (error);
    }

    memorystatus_manual_testing_on = TRUE;

    trigger_request = (level >> 16) & 0xFFFF;
    pressure_level = (level & 0xFFFF);

    if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
        trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
        return EINVAL;
    }
    switch (pressure_level) {
    case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
    case NOTE_MEMORYSTATUS_PRESSURE_WARN:
    case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
        break;
    default:
        return EINVAL;
    }

    /*
     * The pressure level is being set from user-space.
     * And user-space uses the constants in sys/event.h
     * So we translate those events to our internal levels here.
     */
    if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {

        memorystatus_manual_testing_level = kVMPressureNormal;
        force_purge = 0;

    } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {

        memorystatus_manual_testing_level = kVMPressureWarning;
        force_purge = memorystatus_purge_on_warning;

    } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {

        memorystatus_manual_testing_level = kVMPressureCritical;
        force_purge = memorystatus_purge_on_critical;
    }

    memorystatus_vm_pressure_level = memorystatus_manual_testing_level;

    /* purge according to the new pressure level */
    switch (trigger_request) {
    case TEST_PURGEABLE_TRIGGER_ONE:
    case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
        if (force_purge == 0) {
            /* no purging requested */
            break;
        }
        vm_purgeable_object_purge_one_unlocked(force_purge);
        break;
    case TEST_PURGEABLE_TRIGGER_ALL:
    case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
        if (force_purge == 0) {
            /* no purging requested */
            break;
        }
        while (vm_purgeable_object_purge_one_unlocked(force_purge));
        break;
    }

    if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
        (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {

        memorystatus_update_vm_pressure(TRUE);
    }

    if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
        (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {

        while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
            continue;
        }
    }

    if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
        memorystatus_manual_testing_on = FALSE;

        vm_pressure_klist_lock();
        vm_reset_active_list();
        vm_pressure_klist_unlock();
    } else {

        vm_pressure_klist_lock();
        vm_pressure_notification_without_levels(FALSE);
        vm_pressure_klist_unlock();
    }

    return 0;
}
4891
39236c6e
A
4892SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
4893 0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
4894
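/*
 * Example (not compiled): a minimal user-space sketch of driving this
 * trigger, assuming a DEBUG/DEVELOPMENT kernel and access to the private
 * TEST_* and NOTE_MEMORYSTATUS_* definitions. Per the handler above, the
 * trigger request is packed into the high 16 bits of the value and the
 * sys/event.h pressure level into the low 16 bits.
 */
#if 0
#include <stdio.h>
#include <sys/event.h>
#include <sys/sysctl.h>
#include <sys/kern_memorystatus.h>

int
main(void)
{
	/* Request one low-memory jetsam pass at the WARN pressure level. */
	int level = (TEST_LOW_MEMORY_TRIGGER_ONE << 16) | NOTE_MEMORYSTATUS_PRESSURE_WARN;

	if (sysctlbyname("kern.memorypressure_manual_trigger", NULL, NULL,
	    &level, sizeof(level)) == -1) {
		perror("sysctlbyname");
		return 1;
	}
	return 0;
}
#endif /* 0 */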

extern int memorystatus_purge_on_warning;
extern int memorystatus_purge_on_urgent;
extern int memorystatus_purge_on_critical;

SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_warning, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_urgent, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_critical, 0, "");

#endif /* VM_PRESSURE_EVENTS */

/* Return both allocated and actual size, since there's a race between allocation and list compilation */
static int
memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
{
	uint32_t list_count, i = 0;
	memorystatus_priority_entry_t *list_entry;
	proc_t p;

	list_count = memorystatus_list_count;
	*list_size = sizeof(memorystatus_priority_entry_t) * list_count;

	/* Just a size check? */
	if (size_only) {
		return 0;
	}

	/* Otherwise, validate the size of the buffer */
	if (*buffer_size < *list_size) {
		return EINVAL;
	}

	*list_ptr = (memorystatus_priority_entry_t*)kalloc(*list_size);
	if (!*list_ptr) {
		return ENOMEM;
	}

	memset(*list_ptr, 0, *list_size);

	*buffer_size = *list_size;
	*list_size = 0;

	list_entry = *list_ptr;

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (p && (*list_size < *buffer_size)) {
		list_entry->pid = p->p_pid;
		list_entry->priority = p->p_memstat_effectivepriority;
		list_entry->user_data = p->p_memstat_userdata;
#if LEGACY_HIWATER

		/*
		 * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore.
		 * Background limits are described via the inactive limit slots.
		 * So, here, the cached limit should always be valid.
		 */

		if (p->p_memstat_memlimit <= 0) {
			task_get_phys_footprint_limit(p->task, &list_entry->limit);
		} else {
			list_entry->limit = p->p_memstat_memlimit;
		}
#else
		task_get_phys_footprint_limit(p->task, &list_entry->limit);
#endif
		list_entry->state = memorystatus_build_state(p);
		list_entry++;

		*list_size += sizeof(memorystatus_priority_entry_t);

		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
	}

	proc_list_unlock();

	MEMORYSTATUS_DEBUG(1, "memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);

	return 0;
}

static int
memorystatus_cmd_get_priority_list(user_addr_t buffer, size_t buffer_size, int32_t *retval) {
	int error = EINVAL;
	boolean_t size_only;
	memorystatus_priority_entry_t *list = NULL;
	size_t list_size;

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);
	if (error) {
		goto out;
	}

	if (!size_only) {
		error = copyout(list, buffer, list_size);
	}

	if (error == 0) {
		*retval = list_size;
	}
out:

	if (list) {
		kfree(list, buffer_size);
	}

	return error;
}
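
/*
 * Example (not compiled): the two-step pattern this handler supports from
 * user space -- call once with a NULL buffer to learn the required size,
 * then again with a buffer to fetch the entries. The memorystatus_control()
 * prototype shown is an assumption based on the private
 * <sys/kern_memorystatus.h>.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <sys/kern_memorystatus.h>

/* assumed prototype of the private syscall */
extern int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags,
    void *buffer, size_t buffersize);

int
main(void)
{
	int size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, 0, 0, NULL, 0);
	if (size <= 0) {
		return 1;
	}

	memorystatus_priority_entry_t *list = malloc(size);
	if (list == NULL) {
		return 1;
	}

	/* The return value is the number of bytes actually filled in. */
	int used = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, 0, 0, list, size);
	for (int i = 0; i < used / (int)sizeof(*list); i++) {
		printf("pid %d priority %d\n", list[i].pid, list[i].priority);
	}

	free(list);
	return 0;
}
#endif /* 0 */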

#if CONFIG_JETSAM

static void
memorystatus_clear_errors(void)
{
	proc_t p;
	unsigned int i = 0;

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START, 0, 0, 0, 0, 0);

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (p) {
		if (p->p_memstat_state & P_MEMSTAT_ERROR) {
			p->p_memstat_state &= ~P_MEMSTAT_ERROR;
		}
		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
	}

	proc_list_unlock();

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}

static void
memorystatus_update_levels_locked(boolean_t critical_only) {

	memorystatus_available_pages_critical = memorystatus_available_pages_critical_base;

	/*
	 * If there's an entry in the first bucket, we have idle processes.
	 */
	memstat_bucket_t *first_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
	if (first_bucket->count) {
		memorystatus_available_pages_critical += memorystatus_available_pages_critical_idle_offset;

		if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
			/*
			 * The critical threshold must never exceed the pressure threshold.
			 */
			memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
		}
	}

#if DEBUG || DEVELOPMENT
	if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
		memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_diagnostic;

		if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
			/*
			 * The critical threshold must never exceed the pressure threshold.
			 */
			memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
		}
	}
#endif

	if (critical_only) {
		return;
	}

#if VM_PRESSURE_EVENTS
	memorystatus_available_pages_pressure = (pressure_threshold_percentage / delta_percentage) * memorystatus_delta;
#if DEBUG || DEVELOPMENT
	if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
		memorystatus_available_pages_pressure += memorystatus_jetsam_policy_offset_pages_diagnostic;
	}
#endif
#endif
}

/*
 * Get the at_boot snapshot
 */
static int
memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
	size_t input_size = *snapshot_size;

	/*
	 * The at_boot snapshot has no entry list.
	 */
	*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t);

	if (size_only) {
		return 0;
	}

	/*
	 * Validate the size of the snapshot buffer
	 */
	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	/*
	 * Update the notification_time only
	 */
	memorystatus_at_boot_snapshot.notification_time = mach_absolute_time();
	*snapshot = &memorystatus_at_boot_snapshot;

	MEMORYSTATUS_DEBUG(7, "memorystatus_get_at_boot_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%d)\n",
	    (long)input_size, (long)*snapshot_size, 0);
	return 0;
}

static int
memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
	size_t input_size = *snapshot_size;
	uint32_t ods_list_count = memorystatus_list_count;
	memorystatus_jetsam_snapshot_t *ods = NULL;	/* The on_demand snapshot buffer */

	*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (ods_list_count));

	if (size_only) {
		return 0;
	}

	/*
	 * Validate the size of the snapshot buffer.
	 * This is inherently racy. May want to revisit
	 * this error condition and trim the output when
	 * it doesn't fit.
	 */
	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	/*
	 * Allocate and initialize a snapshot buffer.
	 */
	ods = (memorystatus_jetsam_snapshot_t *)kalloc(*snapshot_size);
	if (!ods) {
		return (ENOMEM);
	}

	memset(ods, 0, *snapshot_size);

	proc_list_lock();
	memorystatus_init_jetsam_snapshot_locked(ods, ods_list_count);
	proc_list_unlock();

	/*
	 * Return the kernel allocated, on_demand buffer.
	 * The caller of this routine will copy the data out
	 * to user space and then free the kernel allocated
	 * buffer.
	 */
	*snapshot = ods;

	MEMORYSTATUS_DEBUG(7, "memorystatus_get_on_demand_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
	    (long)input_size, (long)*snapshot_size, (long)ods_list_count);

	return 0;
}

static int
memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
	size_t input_size = *snapshot_size;

	if (memorystatus_jetsam_snapshot_count > 0) {
		*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
	} else {
		*snapshot_size = 0;
	}

	if (size_only) {
		return 0;
	}

	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	*snapshot = memorystatus_jetsam_snapshot;

	MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
	    (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_count);

	return 0;
}


static int
memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval) {
	int error = EINVAL;
	boolean_t size_only;
	boolean_t is_default_snapshot = FALSE;
	boolean_t is_on_demand_snapshot = FALSE;
	boolean_t is_at_boot_snapshot = FALSE;
	memorystatus_jetsam_snapshot_t *snapshot;

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	if (flags == 0) {
		/* Default */
		is_default_snapshot = TRUE;
		error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
	} else {
		if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) {
			/*
			 * Unsupported bit set in flag.
			 */
			return EINVAL;
		}

		if ((flags & (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) ==
		    (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) {
			/*
			 * Can't have both set at the same time.
			 */
			return EINVAL;
		}

		if (flags & MEMORYSTATUS_SNAPSHOT_ON_DEMAND) {
			is_on_demand_snapshot = TRUE;
			/*
			 * When not requesting the size only, the following call will allocate
			 * an on_demand snapshot buffer, which is freed below.
			 */
			error = memorystatus_get_on_demand_snapshot(&snapshot, &buffer_size, size_only);

		} else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
			is_at_boot_snapshot = TRUE;
			error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
		} else {
			/*
			 * Invalid flag setting.
			 */
			return EINVAL;
		}
	}

	if (error) {
		goto out;
	}

	/*
	 * Copy the data out to user space and clear the snapshot buffer.
	 * If working with the jetsam snapshot, clearing the buffer means
	 * resetting the count. If working with an on_demand snapshot,
	 * clearing the buffer means freeing it. If working with the at_boot
	 * snapshot, there is nothing to clear or update.
	 */
	if (!size_only) {
		if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
			if (is_default_snapshot) {
				/*
				 * The jetsam snapshot is never freed, its count is simply reset.
				 */
				snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;

				proc_list_lock();
				memorystatus_jetsam_snapshot_last_timestamp = 0;
				proc_list_unlock();
			}
		}

		if (is_on_demand_snapshot) {
			/*
			 * The on_demand snapshot is always freed,
			 * even if the copyout failed.
			 */
			if (snapshot) {
				kfree(snapshot, buffer_size);
			}
		}
	}

	if (error == 0) {
		*retval = buffer_size;
	}
out:
	return error;
}

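/*
 * Example (not compiled): fetching the default jetsam snapshot from user
 * space with the same size-query-then-copy pattern. A flags value of 0
 * selects the default snapshot; MEMORYSTATUS_SNAPSHOT_ON_DEMAND and
 * MEMORYSTATUS_SNAPSHOT_AT_BOOT select the other two and are mutually
 * exclusive. The syscall prototype is an assumption from the private header.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <sys/kern_memorystatus.h>

/* assumed prototype of the private syscall */
extern int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags,
    void *buffer, size_t buffersize);

int
main(void)
{
	int size = memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0, 0, NULL, 0);
	if (size <= 0) {
		return 1;	/* no snapshot data available */
	}

	memorystatus_jetsam_snapshot_t *snapshot = malloc(size);
	if (snapshot == NULL) {
		return 1;
	}

	/* A successful copy of the default snapshot also resets its entry count. */
	if (memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0, 0, snapshot, size) > 0) {
		printf("%llu entries\n", (unsigned long long)snapshot->entry_count);
	}

	free(snapshot);
	return 0;
}
#endif /* 0 */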
/*
 * Routine: memorystatus_cmd_grp_set_properties
 * Purpose: Update properties for a group of processes.
 *
 * Supported Properties:
 * [priority]
 *	Move each process out of its effective priority
 *	band and into a new priority band.
 *	Maintains relative order from lowest to highest priority.
 *	In a single band, maintains relative order from head to tail.
 *
 *	eg: before	[effectivepriority | pid]
 *			[18 | p101 ]
 *			[17 | p55, p67, p19 ]
 *			[12 | p103 p10 ]
 *			[ 7 | p25 ]
 *			[ 0 | p71, p82, ]
 *
 *	after	[ new band | pid]
 *		[ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
 *
 * Returns: 0 on success, else non-zero.
 *
 * Caveat: We know there is a race window regarding recycled pids.
 *	A process could be killed before the kernel can act on it here.
 *	If a pid cannot be found in any of the jetsam priority bands,
 *	then we simply ignore it. No harm.
 *	But, if the pid has been recycled then it could be an issue.
 *	In that scenario, we might move an unsuspecting process to the new
 *	priority band. It's not clear how the kernel can safeguard
 *	against this, but it would be an extremely rare case anyway.
 *	The caller of this API might avoid such race conditions by
 *	ensuring that the processes passed in the pid list are suspended.
 */


/* This internal structure can expand when we add support for more properties */
typedef struct memorystatus_internal_properties
{
	proc_t proc;
	int32_t priority;	/* see memorystatus_priority_entry_t : priority */
} memorystatus_internal_properties_t;


static int
memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {

#pragma unused (flags)

	/*
	 * We only handle setting priority
	 * per process.
	 */

	int error = 0;
	memorystatus_priority_entry_t *entries = NULL;
	uint32_t entry_count = 0;

	/* This will be the ordered proc list */
	memorystatus_internal_properties_t *table = NULL;
	size_t table_size = 0;
	uint32_t table_count = 0;

	uint32_t i = 0;
	uint32_t bucket_index = 0;
	boolean_t head_insert;
	int32_t new_priority;

	proc_t p;

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0) || ((buffer_size % sizeof(memorystatus_priority_entry_t)) != 0)) {
		error = EINVAL;
		goto out;
	}

	entry_count = (buffer_size / sizeof(memorystatus_priority_entry_t));
	if ((entries = (memorystatus_priority_entry_t *)kalloc(buffer_size)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, entry_count, 0, 0, 0, 0);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/* Verify sanity of input priorities */
	for (i = 0; i < entry_count; i++) {
		if (entries[i].priority == -1) {
			/* Use as shorthand for default priority */
			entries[i].priority = JETSAM_PRIORITY_DEFAULT;
		} else if (entries[i].priority == JETSAM_PRIORITY_IDLE_DEFERRED) {
			/* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use;
			 * if requested, adjust to JETSAM_PRIORITY_IDLE. */
			entries[i].priority = JETSAM_PRIORITY_IDLE;
		} else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
			 * queue */
			/* Deal with this later */
		} else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
			/* Sanity check */
			error = EINVAL;
			goto out;
		}
	}

	table_size = sizeof(memorystatus_internal_properties_t) * entry_count;
	if ((table = (memorystatus_internal_properties_t *)kalloc(table_size)) == NULL) {
		error = ENOMEM;
		goto out;
	}
	memset(table, 0, table_size);


	/*
	 * For each jetsam bucket entry, spin through the input property list.
	 * When a matching pid is found, populate an adjacent table with the
	 * appropriate proc pointer and new property values.
	 * This traversal automatically preserves order from lowest
	 * to highest priority.
	 */

	bucket_index = 0;

	proc_list_lock();

	/* Create the ordered table */
	p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
	while (p && (table_count < entry_count)) {
		for (i = 0; i < entry_count; i++) {
			if (p->p_pid == entries[i].pid) {
				/* Build the table data */
				table[table_count].proc = p;
				table[table_count].priority = entries[i].priority;
				table_count++;
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
	}

	/* We now have an ordered list of procs ready to move */
	for (i = 0; i < table_count; i++) {
		p = table[i].proc;
		assert(p != NULL);

		/* Allow head inserts -- but relative order is now */
		if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			new_priority = JETSAM_PRIORITY_IDLE;
			head_insert = true;
		} else {
			new_priority = table[i].priority;
			head_insert = false;
		}

		/* Not allowed */
		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			continue;
		}

		/*
		 * Take appropriate steps if moving proc out of the
		 * JETSAM_PRIORITY_IDLE_DEFERRED band.
		 */
		if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
		}

		memorystatus_update_priority_locked(p, new_priority, head_insert);
	}

	proc_list_unlock();

	/*
	 * if (table_count != entry_count)
	 * then some pids were not found in a jetsam band.
	 * harmless but interesting...
	 */
	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, entry_count, table_count, 0, 0, 0);

out:
	if (entries)
		kfree(entries, buffer_size);
	if (table)
		kfree(table, table_size);

	return (error);
}

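/*
 * Example (not compiled): moving a group of pids into one band from user
 * space. Per the caveat above, callers would ideally suspend these processes
 * first. The syscall prototype is an assumption from the private header.
 */
#if 0
#include <string.h>
#include <sys/kern_memorystatus.h>

/* assumed prototype of the private syscall */
extern int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags,
    void *buffer, size_t buffersize);

static int
park_in_idle_band(pid_t *pids, uint32_t count)
{
	memorystatus_priority_entry_t entries[count];

	memset(entries, 0, sizeof(entries));
	for (uint32_t i = 0; i < count; i++) {
		entries[i].pid = pids[i];
		entries[i].priority = JETSAM_PRIORITY_IDLE;
	}

	/* Relative order of the pids is preserved within the new band. */
	return memorystatus_control(MEMORYSTATUS_CMD_GRP_SET_PROPERTIES, 0, 0,
	    entries, count * sizeof(memorystatus_priority_entry_t));
}
#endif /* 0 */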

/*
 * This routine is used to update a process's jetsam priority position and stored user_data.
 * It is not used for the setting of memory limits, which is why the last seven args to the
 * memorystatus_update() call are 0 or FALSE.
 */

static int
memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
	int error = 0;
	memorystatus_priority_properties_t mpp_entry;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_properties_t))) {
		return EINVAL;
	}

	error = copyin(buffer, &mpp_entry, buffer_size);

	if (error == 0) {
		proc_t p;

		p = proc_find(pid);
		if (!p) {
			return ESRCH;
		}

		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			proc_rele(p);
			return EPERM;
		}

		error = memorystatus_update(p, mpp_entry.priority, mpp_entry.user_data, FALSE, FALSE, 0, 0, FALSE, FALSE, FALSE);
		proc_rele(p);
	}

	return (error);
}

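/*
 * Example (not compiled): the user-space side of this call -- move a pid to
 * the idle band and stamp it with opaque user_data. The syscall prototype is
 * an assumption from the private header.
 */
#if 0
#include <sys/kern_memorystatus.h>

/* assumed prototype of the private syscall */
extern int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags,
    void *buffer, size_t buffersize);

static int
demote_to_idle(pid_t pid)
{
	memorystatus_priority_properties_t props = {
		.priority = JETSAM_PRIORITY_IDLE,
		.user_data = 0,
	};

	return memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, pid, 0,
	    &props, sizeof(props));
}
#endif /* 0 */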
static int
memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
	int error = 0;
	memorystatus_memlimit_properties_t mmp_entry;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
		return EINVAL;
	}

	error = copyin(buffer, &mmp_entry, buffer_size);

	if (error == 0) {
		error = memorystatus_set_memlimit_properties(pid, &mmp_entry);
	}

	return (error);
}

/*
 * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
 * That gets the proc's cached memlimit, and there is no guarantee that the active/inactive
 * limits will be the same in the no-limit case. Instead we convert limits <= 0 using
 * task_convert_phys_footprint_limit(). It computes the same limit value that would be written
 * to the task's ledgers via task_set_phys_footprint_limit().
 */
static int
memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
	int error = 0;
	memorystatus_memlimit_properties_t mmp_entry;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
		return EINVAL;
	}

	memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties_t));

	proc_t p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	/*
	 * Get the active limit and attributes.
	 * No locks taken since we hold a reference to the proc.
	 */

	if (p->p_memstat_memlimit_active > 0) {
		mmp_entry.memlimit_active = p->p_memstat_memlimit_active;
	} else {
		task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_active);
	}

	if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {
		mmp_entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
	}

	/*
	 * Get the inactive limit and attributes.
	 */
	if (p->p_memstat_memlimit_inactive <= 0) {
		task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_inactive);
	} else {
		mmp_entry.memlimit_inactive = p->p_memstat_memlimit_inactive;
	}
	if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {
		mmp_entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
	}
	proc_rele(p);

	error = copyout(&mmp_entry, buffer, buffer_size);

	return (error);
}

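/*
 * Example (not compiled): reading back the active/inactive limit pair for a
 * pid. A no-limit slot comes back as the system-wide task limit (rather than
 * 0), which is what the task_convert_phys_footprint_limit() translation
 * above provides. The syscall prototype is an assumption from the private
 * header.
 */
#if 0
#include <stdio.h>
#include <sys/kern_memorystatus.h>

/* assumed prototype of the private syscall */
extern int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags,
    void *buffer, size_t buffersize);

static void
print_memlimits(pid_t pid)
{
	memorystatus_memlimit_properties_t mmp;

	if (memorystatus_control(MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES, pid, 0,
	    &mmp, sizeof(mmp)) == 0) {
		printf("pid %d: active %dMB%s, inactive %dMB%s\n", pid,
		    (int)mmp.memlimit_active,
		    (mmp.memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) ? " (fatal)" : "",
		    (int)mmp.memlimit_inactive,
		    (mmp.memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) ? " (fatal)" : "");
	}
}
#endif /* 0 */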

static int
memorystatus_cmd_get_pressure_status(int32_t *retval) {
	int error;

	/* Need privilege for check */
	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
	if (error) {
		return (error);
	}

	/* Inherently racy, so it's not worth taking a lock here */
	*retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;

	return error;
}

int
memorystatus_get_pressure_status_kdp(void) {
	return (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
}

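/*
 * Example (not compiled): the simplest consumer of this command -- a boolean
 * "is the system under pressure?" query. It requires the PRIV_VM_PRESSURE
 * privilege checked above. The syscall prototype is an assumption from the
 * private header.
 */
#if 0
#include <stdio.h>
#include <sys/kern_memorystatus.h>

/* assumed prototype of the private syscall */
extern int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags,
    void *buffer, size_t buffersize);

int
main(void)
{
	int pressured = memorystatus_control(MEMORYSTATUS_CMD_GET_PRESSURE_STATUS, 0, 0, NULL, 0);

	if (pressured < 0) {
		perror("memorystatus_control");
		return 1;
	}
	printf("memory pressure: %s\n", pressured ? "elevated" : "normal");
	return 0;
}
#endif /* 0 */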
/*
 * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
 *
 * This call is inflexible -- it does not distinguish between active/inactive, fatal/non-fatal.
 * So, with a 2-level HWM, preserving previous behavior maps as follows:
 * - treat the limit passed in as both an active and inactive limit.
 * - treat the is_fatal_limit flag as though it applies to both active and inactive limits.
 *
 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK
 * - the is_fatal_limit is FALSE, meaning the active and inactive limits are non-fatal/soft
 * - so the mapping is (active/non-fatal, inactive/non-fatal)
 *
 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
 * - the is_fatal_limit is TRUE, meaning the process's active and inactive limits are fatal/hard
 * - so the mapping is (active/fatal, inactive/fatal)
 */

static int
memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit) {
	int error = 0;
	memorystatus_memlimit_properties_t entry;

	entry.memlimit_active = high_water_mark;
	entry.memlimit_active_attr = 0;
	entry.memlimit_inactive = high_water_mark;
	entry.memlimit_inactive_attr = 0;

	if (is_fatal_limit == TRUE) {
		entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
		entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
	}

	error = memorystatus_set_memlimit_properties(pid, &entry);
	return (error);
}

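/*
 * Example (not compiled): the two legacy entry points expressed in terms of
 * the mapping above -- the limit travels in the flags argument, in MB.
 * Setting a 100MB high-water mark yields a non-fatal active/inactive pair;
 * the task-limit variant yields a fatal pair. The syscall prototype is an
 * assumption from the private header.
 */
#if 0
#include <sys/kern_memorystatus.h>

/* assumed prototype of the private syscall */
extern int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags,
    void *buffer, size_t buffersize);

static void
set_legacy_limits(pid_t pid)
{
	/* (active/non-fatal, inactive/non-fatal), both 100MB */
	memorystatus_control(MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK, pid, 100, NULL, 0);

	/* (active/fatal, inactive/fatal), both 100MB */
	memorystatus_control(MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT, pid, 100, NULL, 0);
}
#endif /* 0 */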
static int
memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry) {

	int32_t memlimit_active;
	boolean_t memlimit_active_is_fatal;
	int32_t memlimit_inactive;
	boolean_t memlimit_inactive_is_fatal;
	uint32_t valid_attrs = 0;
	int error = 0;

	proc_t p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	/*
	 * Check for valid attribute flags.
	 */
	valid_attrs |= (MEMORYSTATUS_MEMLIMIT_ATTR_FATAL);
	if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
		proc_rele(p);
		return EINVAL;
	}
	if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
		proc_rele(p);
		return EINVAL;
	}

	/*
	 * Set up the active memlimit properties.
	 */
	memlimit_active = entry->memlimit_active;
	if (entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
		memlimit_active_is_fatal = TRUE;
	} else {
		memlimit_active_is_fatal = FALSE;
	}

	/*
	 * Set up the inactive memlimit properties.
	 */
	memlimit_inactive = entry->memlimit_inactive;
	if (entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
		memlimit_inactive_is_fatal = TRUE;
	} else {
		memlimit_inactive_is_fatal = FALSE;
	}

	/*
	 * Setting a limit of <= 0 implies that the process has no
	 * high-water-mark and has no per-task-limit. That means
	 * the system_wide task limit is in place, which by the way,
	 * is always fatal.
	 */

	if (memlimit_active <= 0) {
		/*
		 * Enforce the fatal system_wide task limit while process is active.
		 */
		memlimit_active = -1;
		memlimit_active_is_fatal = TRUE;
	}

	if (memlimit_inactive <= 0) {
		/*
		 * Enforce the fatal system_wide task limit while process is inactive.
		 */
		memlimit_inactive = -1;
		memlimit_inactive_is_fatal = TRUE;
	}

	proc_list_lock();

	/*
	 * Store the active limit variants in the proc.
	 */
	SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);

	/*
	 * Store the inactive limit variants in the proc.
	 */
	SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);

	/*
	 * Enforce appropriate limit variant by updating the cached values
	 * and writing the ledger.
	 * Limit choice is based on process active/inactive state.
	 */

	if (memorystatus_highwater_enabled) {
		boolean_t trigger_exception;
		/*
		 * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore.
		 * Background limits are described via the inactive limit slots.
		 */

		if (proc_jetsam_state_is_active_locked(p) == TRUE) {
			CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);
		} else {
			CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
		}

		/* Enforce the limit by writing to the ledgers */
		assert(trigger_exception == TRUE);
		error = (task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, trigger_exception) == 0) ? 0 : EINVAL;

		MEMORYSTATUS_DEBUG(3, "memorystatus_set_memlimit_properties: new limit on pid %d (%dMB %s) current priority (%d) dirty_state?=0x%x %s\n",
		    p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
		    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
		    (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
	}

	proc_list_unlock();
	proc_rele(p);

	return error;
}

/*
 * Returns the jetsam priority (effective or requested) of the process
 * associated with this task.
 */
int
proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
{
	if (p) {
		if (effective_priority) {
			return p->p_memstat_effectivepriority;
		} else {
			return p->p_memstat_requestedpriority;
		}
	}
	return 0;
}

/*
 * Description:
 *	Evaluates active vs. inactive process state.
 *	Processes that opt into dirty tracking are evaluated
 *	based on clean vs. dirty state.
 *	dirty ==> active
 *	clean ==> inactive
 *
 *	Processes that do not opt into dirty tracking are
 *	evaluated based on priority level.
 *	Foreground or above ==> active
 *	Below Foreground ==> inactive
 *
 * Return: TRUE if active
 *	FALSE if inactive
 */

static boolean_t
proc_jetsam_state_is_active_locked(proc_t p) {

	if (p->p_memstat_dirty & P_DIRTY_TRACK) {
		/*
		 * Process has opted into dirty tracking;
		 * active state is based on dirty vs. clean.
		 */
		if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
			/*
			 * Process is dirty,
			 * which implies active state.
			 */
			return TRUE;
		} else {
			/*
			 * Process is clean,
			 * which implies inactive state.
			 */
			return FALSE;
		}
	} else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
		/*
		 * Process is Foreground or higher,
		 * which implies active state.
		 */
		return TRUE;
	} else {
		/*
		 * Process is below Foreground,
		 * which implies inactive state.
		 */
		return FALSE;
	}
}

#endif /* CONFIG_JETSAM */

int
memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret) {
	int error = EINVAL;

#if !CONFIG_JETSAM
	#pragma unused(ret)
#endif

	/* Root only for now */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		error = EPERM;
		goto out;
	}

	/* Sanity check */
	if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	switch (args->command) {
	case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
		error = memorystatus_cmd_get_priority_list(args->buffer, args->buffersize, ret);
		break;
#if CONFIG_JETSAM
	case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
		error = memorystatus_cmd_set_priority_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
		error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES:
		error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
		error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
		error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
		error = memorystatus_cmd_get_pressure_status(ret);
		break;
	case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
		/*
		 * This call does not distinguish between active and inactive limits.
		 * Default behavior in 2-level HWM world is to set both.
		 * Non-fatal limit is also assumed for both.
		 */
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
		break;
	case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
		/*
		 * This call does not distinguish between active and inactive limits.
		 * Default behavior in 2-level HWM world is to set both.
		 * Fatal limit is also assumed for both.
		 */
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
		break;
	/* Test commands */
#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_TEST_JETSAM:
		error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled) ? 0 : EINVAL;
		break;
	case MEMORYSTATUS_CMD_TEST_JETSAM_SORT:
		error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags);
		break;
	case MEMORYSTATUS_CMD_SET_JETSAM_PANIC_BITS:
		error = memorystatus_cmd_set_panic_bits(args->buffer, args->buffersize);
		break;
#endif /* DEVELOPMENT || DEBUG */
#endif /* CONFIG_JETSAM */
	case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
	case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
		error = memorystatus_low_mem_privileged_listener(args->command);
		break;
	default:
		break;
	}

out:
	return error;
}


static int
filt_memorystatusattach(struct knote *kn)
{
	kn->kn_flags |= EV_CLEAR;
	return memorystatus_knote_register(kn);
}

static void
filt_memorystatusdetach(struct knote *kn)
{
	memorystatus_knote_unregister(kn);
}

static int
filt_memorystatus(struct knote *kn __unused, long hint)
{
	if (hint) {
		switch (hint) {
		case kMemorystatusNoPressure:
			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
				kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
			}
			break;
		case kMemorystatusPressure:
			if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
					kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
				}
			} else if (memorystatus_vm_pressure_level == kVMPressureCritical) {

				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
					kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
				}
			}
			break;
		case kMemorystatusLowSwap:
			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
				kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
			}
			break;
		default:
			break;
		}
	}

	return (kn->kn_fflags != 0);
}

static void
memorystatus_klist_lock(void) {
	lck_mtx_lock(&memorystatus_klist_mutex);
}

static void
memorystatus_klist_unlock(void) {
	lck_mtx_unlock(&memorystatus_klist_mutex);
}

void
memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr) {
	lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
	klist_init(&memorystatus_klist);
}

int
memorystatus_knote_register(struct knote *kn) {
	int error = 0;

	memorystatus_klist_lock();

	if (kn->kn_sfflags & (NOTE_MEMORYSTATUS_PRESSURE_NORMAL | NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL | NOTE_MEMORYSTATUS_LOW_SWAP)) {

		KNOTE_ATTACH(&memorystatus_klist, kn);

	} else {
		error = ENOTSUP;
	}

	memorystatus_klist_unlock();

	return error;
}

void
memorystatus_knote_unregister(struct knote *kn __unused) {
	memorystatus_klist_lock();
	KNOTE_DETACH(&memorystatus_klist, kn);
	memorystatus_klist_unlock();
}

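/*
 * Example (not compiled): how a user-space client would attach one of these
 * knotes -- register an EVFILT_MEMORYSTATUS kevent with the pressure levels
 * of interest in fflags. The ident of 0 is an assumption; the filter keys
 * off the sfflags checked in memorystatus_knote_register() above.
 */
#if 0
#include <stdio.h>
#include <sys/types.h>
#include <sys/event.h>

int
main(void)
{
	struct kevent ke, out;
	int kq = kqueue();

	if (kq == -1) {
		return 1;
	}

	EV_SET(&ke, 0, EVFILT_MEMORYSTATUS, EV_ADD | EV_CLEAR,
	    NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL,
	    0, NULL);
	if (kevent(kq, &ke, 1, NULL, 0, NULL) == -1) {
		perror("kevent");
		return 1;
	}

	/* Blocks until the kernel posts a pressure transition. */
	if (kevent(kq, NULL, 0, &out, 1, NULL) == 1) {
		printf("pressure fflags: 0x%x\n", (unsigned)out.fflags);
	}
	return 0;
}
#endif /* 0 */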

#if 0
#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
static boolean_t
memorystatus_issue_pressure_kevent(boolean_t pressured) {
	memorystatus_klist_lock();
	KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
	memorystatus_klist_unlock();
	return TRUE;
}
#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
#endif /* 0 */

#if CONFIG_JETSAM
/* Coalition support */

/* sorting info for a particular priority bucket */
typedef struct memstat_sort_info {
	coalition_t	msi_coal;
	uint64_t	msi_page_count;
	pid_t		msi_pid;
	int		msi_ntasks;
} memstat_sort_info_t;

/*
 * qsort from smallest page count to largest page count
 *
 * return < 0 for a < b
 *          0 for a == b
 *        > 0 for a > b
 */
static int memstat_asc_cmp(const void *a, const void *b)
{
	const memstat_sort_info_t *msA = (const memstat_sort_info_t *)a;
	const memstat_sort_info_t *msB = (const memstat_sort_info_t *)b;

	/*
	 * Compare explicitly rather than subtracting: the difference of two
	 * 64-bit page counts can wrap or truncate when narrowed to an int.
	 */
	if (msA->msi_page_count < msB->msi_page_count) {
		return -1;
	} else if (msA->msi_page_count > msB->msi_page_count) {
		return 1;
	}
	return 0;
}

/*
 * Return the number of pids rearranged during this sort.
 */
static int
memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order)
{
#define MAX_SORT_PIDS		80
#define MAX_COAL_LEADERS	10

	unsigned int b = bucket_index;
	int nleaders = 0;
	int ntasks = 0;
	proc_t p = NULL;
	coalition_t coal = COALITION_NULL;
	int pids_moved = 0;
	int total_pids_moved = 0;
	int i;

	/*
	 * The system is typically under memory pressure when in this
	 * path, hence, we want to avoid dynamic memory allocation.
	 */
	memstat_sort_info_t leaders[MAX_COAL_LEADERS];
	pid_t pid_list[MAX_SORT_PIDS];

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return(0);
	}

	/*
	 * Clear the array that holds coalition leader information
	 */
	for (i = 0; i < MAX_COAL_LEADERS; i++) {
		leaders[i].msi_coal = COALITION_NULL;
		leaders[i].msi_page_count = 0;	/* will hold total coalition page count */
		leaders[i].msi_pid = 0;		/* will hold coalition leader pid */
		leaders[i].msi_ntasks = 0;	/* will hold the number of tasks in a coalition */
	}

	p = memorystatus_get_first_proc_locked(&b, FALSE);
	while (p) {
		if (coalition_is_leader(p->task, COALITION_TYPE_JETSAM, &coal)) {
			if (nleaders < MAX_COAL_LEADERS) {
				int coal_ntasks = 0;
				uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks);
				leaders[nleaders].msi_coal = coal;
				leaders[nleaders].msi_page_count = coal_page_count;
				leaders[nleaders].msi_pid = p->p_pid;	/* the coalition leader */
				leaders[nleaders].msi_ntasks = coal_ntasks;
				nleaders++;
			} else {
				/*
				 * We've hit MAX_COAL_LEADERS, meaning we can handle no more coalitions.
				 * Abandoned coalitions will linger at the tail of the priority band
				 * when this sort session ends.
				 * TODO: should this be an assert?
				 */
				printf("%s: WARNING: more than %d leaders in priority band [%d]\n",
				    __FUNCTION__, MAX_COAL_LEADERS, bucket_index);
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&b, p, FALSE);
	}

	if (nleaders == 0) {
		/* Nothing to sort */
		return(0);
	}

	/*
	 * Sort the coalition leader array, from smallest coalition page count
	 * to largest coalition page count. When inserted in the priority bucket,
	 * the smallest coalition is handled first, resulting in the last to be jetsammed.
	 */
	if (nleaders > 1) {
		qsort(leaders, nleaders, sizeof(memstat_sort_info_t), memstat_asc_cmp);
	}

#if 0
	for (i = 0; i < nleaders; i++) {
		printf("%s: coal_leader[%d of %d] pid[%d] pages[%llu] ntasks[%d]\n",
		    __FUNCTION__, i, nleaders, leaders[i].msi_pid, leaders[i].msi_page_count,
		    leaders[i].msi_ntasks);
	}
#endif

	/*
	 * During coalition sorting, processes in a priority band are rearranged
	 * by being re-inserted at the head of the queue. So, when handling a
	 * list, the first process that gets moved to the head of the queue
	 * ultimately gets pushed toward the queue tail, and hence, jetsams last.
	 *
	 * So, for example, the coalition leader is expected to jetsam last,
	 * after its coalition members. Therefore, the coalition leader is
	 * inserted at the head of the queue first.
	 *
	 * After processing a coalition, the jetsam order is as follows:
	 * undefs(jetsam first), extensions, xpc services, leader(jetsam last)
	 */

	/*
	 * Coalition members are rearranged in the priority bucket here,
	 * based on their coalition role.
	 */
	total_pids_moved = 0;
	for (i = 0; i < nleaders; i++) {

		/* a bit of bookkeeping */
		pids_moved = 0;

		/* Coalition leaders are jetsammed last, so move into place first */
		pid_list[0] = leaders[i].msi_pid;
		pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, 1);

		/* xpc services should jetsam after extensions */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_XPC,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* extensions should jetsam after unmarked processes */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_EXT,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* undefined coalition members should be the first to jetsam */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_UNDEF,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

#if 0
		if (pids_moved == leaders[i].msi_ntasks) {
			/*
			 * All the pids in the coalition were found in this band.
			 */
			printf("%s: pids_moved[%d] equal total coalition ntasks[%d] \n", __FUNCTION__,
			    pids_moved, leaders[i].msi_ntasks);
		} else if (pids_moved > leaders[i].msi_ntasks) {
			/*
			 * Apparently new coalition members showed up during the sort?
			 */
			printf("%s: pids_moved[%d] were greater than expected coalition ntasks[%d] \n", __FUNCTION__,
			    pids_moved, leaders[i].msi_ntasks);
		} else {
			/*
			 * Apparently not all the pids in the coalition were found in this band?
			 */
			printf("%s: pids_moved[%d] were less than expected coalition ntasks[%d] \n", __FUNCTION__,
			    pids_moved, leaders[i].msi_ntasks);
		}
#endif

		total_pids_moved += pids_moved;

	} /* end for */

	return(total_pids_moved);
}


/*
 * Traverse a list of pids, searching for each within the priority band provided.
 * If a pid is found, move it to the front of the priority band.
 * Never searches outside the priority band provided.
 *
 * Input:
 *	bucket_index - jetsam priority band.
 *	pid_list - pointer to a list of pids.
 *	list_sz - number of pids in the list.
 *
 * Pid list ordering is important in that
 * pid_list[n] is expected to jetsam ahead of pid_list[n+1].
 * The sort_order is set by the coalition default.
 *
 * Return:
 *	the number of pids found and hence moved within the priority band.
 */
static int
memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz)
{
	memstat_bucket_t *current_bucket;
	int i;
	int found_pids = 0;

	if ((pid_list == NULL) || (list_sz <= 0)) {
		return(0);
	}

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return(0);
	}

	current_bucket = &memstat_bucket[bucket_index];
	for (i = 0; i < list_sz; i++) {
		unsigned int b = bucket_index;
		proc_t p = NULL;
		proc_t aProc = NULL;
		pid_t aPid;
		int list_index;

		list_index = ((list_sz - 1) - i);
		aPid = pid_list[list_index];

		/* never search beyond bucket_index provided */
		p = memorystatus_get_first_proc_locked(&b, FALSE);
		while (p) {
			if (p->p_pid == aPid) {
				aProc = p;
				break;
			}
			p = memorystatus_get_next_proc_locked(&b, p, FALSE);
		}

		if (aProc == NULL) {
			/* pid not found in this band, just skip it */
			continue;
		} else {
			TAILQ_REMOVE(&current_bucket->list, aProc, p_memstat_list);
			TAILQ_INSERT_HEAD(&current_bucket->list, aProc, p_memstat_list);
			found_pids++;
		}
	}
	return(found_pids);
}
#endif /* CONFIG_JETSAM */