/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 */

#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/assert.h>
#include <kern/debug.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/host.h>
#include <libkern/libkern.h>
#include <mach/coalition.h>
#include <mach/mach_time.h>
#include <mach/task.h>
#include <mach/host_priv.h>
#include <mach/mach_host.h>
#include <pexpert/pexpert.h>
#include <sys/coalition.h>
#include <sys/kern_event.h>
#include <sys/proc.h>
#include <sys/proc_info.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/wait.h>
#include <sys/tree.h>
#include <sys/priv.h>
#include <vm/vm_pageout.h>
#include <vm/vm_protos.h>

#if CONFIG_FREEZE
#include <vm/vm_map.h>
#endif /* CONFIG_FREEZE */

#include <sys/kern_memorystatus.h>

#if CONFIG_JETSAM
/* For logging clarity */
static const char *jetsam_kill_cause_name[] = {
	"",
	"jettisoned",		/* kMemorystatusKilled			*/
	"highwater",		/* kMemorystatusKilledHiwat		*/
	"vnode-limit",		/* kMemorystatusKilledVnodes		*/
	"vm-pageshortage",	/* kMemorystatusKilledVMPageShortage	*/
	"vm-thrashing",		/* kMemorystatusKilledVMThrashing	*/
	"fc-thrashing",		/* kMemorystatusKilledFCThrashing	*/
	"per-process-limit",	/* kMemorystatusKilledPerProcessLimit	*/
	"diagnostic",		/* kMemorystatusKilledDiagnostic	*/
	"idle-exit",		/* kMemorystatusKilledIdleExit		*/
};
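
/*
 * Illustrative note (not in the original source): the kill cause value is
 * usable directly as an index into this table, so a kill with cause
 * kMemorystatusKilledVMThrashing would be logged via
 * jetsam_kill_cause_name[kMemorystatusKilledVMThrashing], i.e. "vm-thrashing".
 */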

/* Does cause indicate vm or fc thrashing? */
static boolean_t
is_thrashing(unsigned cause)
{
	switch (cause) {
	case kMemorystatusKilledVMThrashing:
	case kMemorystatusKilledFCThrashing:
		return TRUE;
	default:
		return FALSE;
	}
}

/* Callback into vm_compressor.c to signal that thrashing has been mitigated. */
extern void vm_thrashing_jetsam_done(void);
#endif /* CONFIG_JETSAM */

/*
 * These are very verbose printf()s; enable with
 * MEMORYSTATUS_DEBUG_LOG.
 */
#if MEMORYSTATUS_DEBUG_LOG
#define MEMORYSTATUS_DEBUG(cond, format, ...)          \
do {                                                   \
	if (cond) { printf(format, ##__VA_ARGS__); }   \
} while(0)
#else
#define MEMORYSTATUS_DEBUG(cond, format, ...)
#endif

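/*
 * Usage sketch (illustrative, not part of the original source): with
 * MEMORYSTATUS_DEBUG_LOG defined, a call such as
 *
 *	MEMORYSTATUS_DEBUG(1, "memorystatus: pid %d demoted\n", p->p_pid);
 *
 * expands to a guarded printf(); on kernels built without it, the macro
 * expands to nothing, so the call and its arguments compile away.
 */
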
/*
 * Active / Inactive limit support
 * proc list must be locked
 *
 * The SET_*** macros are used to initialize a limit
 * for the first time.
 *
 * The CACHE_*** macros are used to cache the limit that will
 * soon be in effect down in the ledgers.
 */

#define SET_ACTIVE_LIMITS_LOCKED(p, limit, is_fatal)			\
MACRO_BEGIN								\
	(p)->p_memstat_memlimit_active = (limit);			\
	(p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_EXC_TRIGGERED; \
	if (is_fatal) {							\
		(p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL; \
	} else {							\
		(p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL; \
	}								\
MACRO_END

#define SET_INACTIVE_LIMITS_LOCKED(p, limit, is_fatal)			\
MACRO_BEGIN								\
	(p)->p_memstat_memlimit_inactive = (limit);			\
	(p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_EXC_TRIGGERED; \
	if (is_fatal) {							\
		(p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL; \
	} else {							\
		(p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL; \
	}								\
MACRO_END

#define CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception)		\
MACRO_BEGIN								\
	(p)->p_memstat_memlimit = (p)->p_memstat_memlimit_active;	\
	if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {	\
		(p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;	\
	} else {							\
		(p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;	\
	}								\
	if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_EXC_TRIGGERED) { \
		trigger_exception = FALSE;				\
	} else {							\
		trigger_exception = TRUE;				\
	}								\
MACRO_END

#define CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception)		\
MACRO_BEGIN								\
	(p)->p_memstat_memlimit = (p)->p_memstat_memlimit_inactive;	\
	if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {	\
		(p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;	\
	} else {							\
		(p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;	\
	}								\
	if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_EXC_TRIGGERED) { \
		trigger_exception = FALSE;				\
	} else {							\
		trigger_exception = TRUE;				\
	}								\
MACRO_END

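/*
 * Illustrative flow (a sketch, not part of the original source; it mirrors
 * the pattern used later in this file): a limit is stored once with SET_*,
 * then the CACHE_* step selects which stored variant feeds the ledgers.
 *
 *	boolean_t trigger_exception;
 *	SET_ACTIVE_LIMITS_LOCKED(p, 100, TRUE);	// 100 MB, fatal while active
 *	CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);
 *	task_set_phys_footprint_limit_internal(p->task,
 *	    (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
 *	    NULL, trigger_exception);
 */
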
/* General tunables */

unsigned long delta_percentage = 5;
unsigned long critical_threshold_percentage = 5;
unsigned long idle_offset_percentage = 5;
unsigned long pressure_threshold_percentage = 15;
unsigned long freeze_threshold_percentage = 50;

/* General memorystatus stuff */

struct klist memorystatus_klist;
static lck_mtx_t memorystatus_klist_mutex;

static void memorystatus_klist_lock(void);
static void memorystatus_klist_unlock(void);

static uint64_t memorystatus_idle_delay_time = 0;

/*
 * Memorystatus kevents
 */

static int filt_memorystatusattach(struct knote *kn);
static void filt_memorystatusdetach(struct knote *kn);
static int filt_memorystatus(struct knote *kn, long hint);

struct filterops memorystatus_filtops = {
	.f_attach = filt_memorystatusattach,
	.f_detach = filt_memorystatusdetach,
	.f_event = filt_memorystatus,
};

enum {
	kMemorystatusNoPressure = 0x1,
	kMemorystatusPressure = 0x2,
	kMemorystatusLowSwap = 0x4
};

/* Idle guard handling */

static int32_t memorystatus_scheduled_idle_demotions = 0;

static thread_call_t memorystatus_idle_demotion_call;

static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state);
static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state);
static void memorystatus_reschedule_idle_demotion_locked(void);

static void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert);

boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
void memorystatus_send_low_swap_note(void);

int memorystatus_wakeup = 0;

unsigned int memorystatus_level = 0;
unsigned int memorystatus_early_boot_level = 0;

static int memorystatus_list_count = 0;

#define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1)

typedef struct memstat_bucket {
	TAILQ_HEAD(, proc) list;
	int count;
} memstat_bucket_t;

memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];

uint64_t memstat_idle_demotion_deadline = 0;

static unsigned int memorystatus_dirty_count = 0;

#if CONFIG_JETSAM
SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED, &max_task_footprint_mb, 0, "");
#endif // CONFIG_JETSAM


int
memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
{
	user_addr_t level = 0;

	level = args->level;

	if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
		return EFAULT;
	}

	return 0;
}

static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search);
static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search);

static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);

/* Jetsam */

#if CONFIG_JETSAM

static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);

static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry);

static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static boolean_t proc_jetsam_state_is_active_locked(proc_t);

int proc_get_memstat_priority(proc_t, boolean_t);

/* Kill processes exceeding their limit either under memory pressure (1), or as soon as possible (0) */
#define LEGACY_HIWATER 1

static boolean_t memorystatus_idle_snapshot = 0;

static int memorystatus_highwater_enabled = 1;	/* Update the cached memlimit data. This should be removed. */

unsigned int memorystatus_delta = 0;

static unsigned int memorystatus_available_pages_critical_base = 0;
//static unsigned int memorystatus_last_foreground_pressure_pages = (unsigned int)-1;
static unsigned int memorystatus_available_pages_critical_idle_offset = 0;

/* Jetsam Loop Detection */
static boolean_t memorystatus_jld_enabled = TRUE;		/* Enables jetsam loop detection on all devices */
static uint32_t memorystatus_jld_eval_period_msecs = 0;		/* Init pass sets this based on device memory size */
static int memorystatus_jld_eval_aggressive_count = 3;		/* Raise the priority max after 'n' aggressive loops */
static int memorystatus_jld_eval_aggressive_priority_band_max = 15;	/* Kill aggressively up through this band */

#if DEVELOPMENT || DEBUG
/*
 * Jetsam Loop Detection tunables.
 */

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_period_msecs, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_period_msecs, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_count, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_count, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_priority_band_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_priority_band_max, 0, "");
#endif /* DEVELOPMENT || DEBUG */

#if DEVELOPMENT || DEBUG
static unsigned int memorystatus_jetsam_panic_debug = 0;

static unsigned int memorystatus_jetsam_policy = kPolicyDefault;
static unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0;
static unsigned int memorystatus_debug_dump_this_bucket = 0;
#endif

static unsigned int memorystatus_thread_wasted_wakeup = 0;

static uint32_t kill_under_pressure_cause = 0;

/*
 * default jetsam snapshot support
 */
static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries
static unsigned int memorystatus_jetsam_snapshot_count = 0;
static unsigned int memorystatus_jetsam_snapshot_max = 0;
static uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
static uint64_t memorystatus_jetsam_snapshot_timeout = 0;
#define JETSAM_SNAPSHOT_TIMEOUT_SECS 30

/*
 * snapshot support for memstats collected at boot.
 */
static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot;

static void memorystatus_clear_errors(void);
static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages);
static uint32_t memorystatus_build_state(proc_t p);
static void memorystatus_update_levels_locked(boolean_t critical_only);
//static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);

static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause);
static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, int32_t *priority, uint32_t *errors);
static boolean_t memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors);
#if LEGACY_HIWATER
static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors);
#endif

static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);
static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause);

/* Priority Band Sorting Routines */
static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order);
static int memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order);
static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index);
static int memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz);

/* qsort routines */
typedef int (*cmpfunc_t)(const void *a, const void *b);
extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
static int memstat_asc_cmp(const void *a, const void *b);

#endif /* CONFIG_JETSAM */

/* VM pressure */

extern unsigned int vm_page_free_count;
extern unsigned int vm_page_active_count;
extern unsigned int vm_page_inactive_count;
extern unsigned int vm_page_throttled_count;
extern unsigned int vm_page_purgeable_count;
extern unsigned int vm_page_wire_count;

#if VM_PRESSURE_EVENTS

#include "vm_pressure.h"

extern boolean_t memorystatus_warn_process(pid_t pid, boolean_t critical);

vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;

#if CONFIG_MEMORYSTATUS
unsigned int memorystatus_available_pages = (unsigned int)-1;
unsigned int memorystatus_available_pages_pressure = 0;
unsigned int memorystatus_available_pages_critical = 0;
unsigned int memorystatus_frozen_count = 0;
unsigned int memorystatus_suspended_count = 0;

/*
 * We use this flag to signal if we have any HWM offenders
 * on the system. This way we can reduce the number of wakeups
 * of the memorystatus_thread when the system is between the
 * "pressure" and "critical" threshold.
 *
 * The (re-)setting of this variable is done without any locks
 * or synchronization simply because it is not possible (currently)
 * to keep track of HWM offenders that drop down below their memory
 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
 * by allowing the unguarded modification of this variable.
 */
boolean_t memorystatus_hwm_candidates = 0;

static int memorystatus_send_note(int event_code, void *data, size_t data_length);
#endif /* CONFIG_MEMORYSTATUS */

#endif /* VM_PRESSURE_EVENTS */

/* Freeze */

#if CONFIG_FREEZE

boolean_t memorystatus_freeze_enabled = FALSE;
int memorystatus_freeze_wakeup = 0;

lck_grp_attr_t *freezer_lck_grp_attr;
lck_grp_t *freezer_lck_grp;
static lck_mtx_t freezer_mutex;

static inline boolean_t memorystatus_can_freeze_processes(void);
static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low);

static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused);

/* Thresholds */
static unsigned int memorystatus_freeze_threshold = 0;

static unsigned int memorystatus_freeze_pages_min = 0;
static unsigned int memorystatus_freeze_pages_max = 0;

static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;

static unsigned int memorystatus_freeze_daily_mb_max = FREEZE_DAILY_MB_MAX_DEFAULT;

/* Stats */
static uint64_t memorystatus_freeze_count = 0;
static uint64_t memorystatus_freeze_pageouts = 0;

/* Throttling */
static throttle_interval_t throttle_intervals[] = {
	{      60, 8, 0, 0, { 0, 0 }, FALSE },	/* 1 hour intermediate interval, 8x burst */
	{ 24 * 60, 1, 0, 0, { 0, 0 }, FALSE },	/* 24 hour long interval, no burst */
};

static uint64_t memorystatus_freeze_throttle_count = 0;

static unsigned int memorystatus_suspended_footprint_total = 0;

extern uint64_t vm_swap_get_free_space(void);

static boolean_t memorystatus_freeze_update_throttle(void);

#endif /* CONFIG_FREEZE */
6d2010ae 461
316670eb 462/* Debug */
6d2010ae 463
fe8ab488
A
464extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);
465
6d2010ae 466#if DEVELOPMENT || DEBUG
6d2010ae 467
39236c6e
A
468#if CONFIG_JETSAM
469
3e170ce0
A
470static void
471memorystatus_debug_dump_bucket_locked (unsigned int bucket_index)
472{
473 proc_t p = NULL;
474 uint32_t pages = 0;
475 uint32_t pages_in_mb = 0;
476 unsigned int b = bucket_index;
477 boolean_t traverse_all_buckets = FALSE;
478
479 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
480 traverse_all_buckets = TRUE;
481 b = 0;
482 } else {
483 traverse_all_buckets = FALSE;
484 b = bucket_index;
485 }
486
487 /*
488 * Missing from this dump is the value actually
489 * stored in the ledger... also, format could be better.
490 */
491 printf("memorystatus_debug_dump ***START***\n");
492 printf("bucket [pid] [pages/pages-mb] state [EP / RP] dirty deadline [C-limit / A-limit / IA-limit] name\n");
493 p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
494 while (p) {
495 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
496 pages_in_mb = (pages * 4096) /1024 / 1024;
497 printf("%d [%d] [%d/%dMB] 0x%x [%d / %d] 0x%x %lld [%d%s / %d%s / %d%s] %s\n",
498 b, p->p_pid, pages, pages_in_mb,
499 p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_dirty, p->p_memstat_idledeadline,
500 p->p_memstat_memlimit,
501 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
502 p->p_memstat_memlimit_active,
503 (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"),
504 p->p_memstat_memlimit_inactive,
505 (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"),
506 (p->p_comm ? p->p_comm : "unknown"));
507 p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets);
508 }
509 printf("memorystatus_debug_dump ***END***\n");
510}

static int
sysctl_memorystatus_debug_dump_bucket SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	int bucket_index = 0;
	int error;
	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}
	error = SYSCTL_IN(req, &bucket_index, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}
	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		/*
		 * All jetsam buckets will be dumped.
		 */
	} else {
		/*
		 * Only a single bucket will be dumped.
		 */
	}

	proc_list_lock();
	memorystatus_debug_dump_bucket_locked(bucket_index);
	proc_list_unlock();
	memorystatus_debug_dump_this_bucket = bucket_index;
	return (error);
}

/*
 * Debug aid to look at jetsam buckets and proc jetsam fields.
 * Use this sysctl to act on a particular jetsam bucket.
 * Writing the sysctl triggers the dump.
 * Usage: sysctl kern.memorystatus_debug_dump_this_bucket=<bucket_index>
 */

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_debug_dump_this_bucket, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_debug_dump_this_bucket, 0, sysctl_memorystatus_debug_dump_bucket, "I", "");


/* Debug aid for determining the effective high-water-mark limit */

static int
sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	proc_t p;
	unsigned int b = 0;
	int error, enable = 0;

	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}

	error = SYSCTL_IN(req, &enable, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}

	if (!(enable == 0 || enable == 1)) {
		return EINVAL;
	}

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (p) {
		boolean_t trigger_exception;

		if (enable) {
			/*
			 * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore.
			 * Background limits are described via the inactive limit slots.
			 */

			if (proc_jetsam_state_is_active_locked(p) == TRUE) {
				CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);
			} else {
				CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
			}

		} else {
			/*
			 * Disabling limits does not touch the stored variants.
			 * Set the cached limit fields to system_wide defaults.
			 */
			p->p_memstat_memlimit = -1;
			p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
			trigger_exception = TRUE;
		}

		/*
		 * Enforce the cached limit by writing to the ledger.
		 */
		task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, trigger_exception);

		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
	}

	memorystatus_highwater_enabled = enable;

	proc_list_unlock();

	return 0;

}

SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");
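
/*
 * Usage sketch (not in the original source):
 *	sysctl kern.memorystatus_highwater_enabled=<0|1>
 * Writing 0 replaces every cached per-process limit with the system_wide
 * default; writing 1 re-caches each process's stored active or inactive
 * limit, as selected by its current jetsam state, and writes it to the
 * ledger.
 */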

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");

/* Diagnostic code */

enum {
	kJetsamDiagnosticModeNone = 0,
	kJetsamDiagnosticModeAll = 1,
	kJetsamDiagnosticModeStopAtFirstActive = 2,
	kJetsamDiagnosticModeCount
} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone;

static int jetsam_diagnostic_suspended_one_active_proc = 0;

static int
sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	const char *diagnosticStrings[] = {
		"jetsam: diagnostic mode: resetting critical level.",
		"jetsam: diagnostic mode: will examine all processes",
		"jetsam: diagnostic mode: will stop at first active process"
	};

	int error, val = jetsam_diagnostic_mode;
	boolean_t changed = FALSE;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);
	if ((val < 0) || (val >= kJetsamDiagnosticModeCount)) {
		printf("jetsam: diagnostic mode: invalid value - %d\n", val);
		return EINVAL;
	}

	proc_list_lock();

	if ((unsigned int) val != jetsam_diagnostic_mode) {
		jetsam_diagnostic_mode = val;

		memorystatus_jetsam_policy &= ~kPolicyDiagnoseActive;

		switch (jetsam_diagnostic_mode) {
		case kJetsamDiagnosticModeNone:
			/* Already cleared */
			break;
		case kJetsamDiagnosticModeAll:
			memorystatus_jetsam_policy |= kPolicyDiagnoseAll;
			break;
		case kJetsamDiagnosticModeStopAtFirstActive:
			memorystatus_jetsam_policy |= kPolicyDiagnoseFirst;
			break;
		default:
			/* Already validated */
			break;
		}

		memorystatus_update_levels_locked(FALSE);
		changed = TRUE;
	}

	proc_list_unlock();

	if (changed) {
		printf("%s\n", diagnosticStrings[val]);
	}

	return (0);
}

SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED|CTLFLAG_ANYBODY,
		&jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode");
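
/*
 * Usage sketch (not in the original source):
 *	sysctl debug.jetsam_diagnostic_mode=<0|1|2>
 * where 0 resets the critical level, 1 makes jetsam examine all processes,
 * and 2 makes it stop at the first active process (mirroring
 * diagnosticStrings above).
 */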

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jetsam_policy_offset_pages_diagnostic, 0, "");

#if VM_PRESSURE_EVENTS

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");


/*
 * This routine is used for targeted notifications
 * regardless of system memory pressure.
 * "memnote" is the current user.
 */

static int
sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error = 0, pid = 0;
	int ret = 0;
	struct knote *kn = NULL;
	boolean_t found_knote = FALSE;

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	/*
	 * We inspect 3 lists here for targeted notifications:
	 * - memorystatus_klist
	 * - vm_pressure_klist
	 * - vm_pressure_dormant_klist
	 *
	 * The vm_pressure_* lists are tied to the old VM_PRESSURE
	 * notification mechanism. We intend to stop using that
	 * mechanism and, in turn, get rid of the 2 lists and
	 * vm_dispatch_pressure_note_to_pid() too.
	 */

	memorystatus_klist_lock();

	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		proc_t knote_proc = kn->kn_kq->kq_p;
		pid_t knote_pid = knote_proc->p_pid;

		if (knote_pid == pid) {
			/*
			 * Forcibly send this pid a "warning" memory pressure notification.
			 */
			kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
			found_knote = TRUE;
		}
	}

	if (found_knote) {
		KNOTE(&memorystatus_klist, 0);
		ret = 0;
	} else {
		ret = vm_dispatch_pressure_note_to_pid(pid, FALSE);
	}

	memorystatus_klist_unlock();

	return ret;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
		0, 0, &sysctl_memorystatus_vm_pressure_send, "I", "");
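
/*
 * Usage sketch (not in the original source):
 *	sysctl kern.memorystatus_vm_pressure_send=<pid>
 * Forces a "warning"-level memory pressure notification to the given pid
 * if it is registered on memorystatus_klist; otherwise the request falls
 * back to the legacy vm_dispatch_pressure_note_to_pid() path.
 */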

#endif /* VM_PRESSURE_EVENTS */

#endif /* CONFIG_JETSAM */

#if CONFIG_FREEZE

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, "");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");

SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, "");
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");

boolean_t memorystatus_freeze_throttle_enabled = TRUE;
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");

/*
 * Manual trigger of freeze and thaw for dev / debug kernels only.
 */
static int
sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error, pid = 0;
	proc_t p;

	if (memorystatus_freeze_enabled == FALSE) {
		return ENOTSUP;
	}

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	if (pid == 2) {
		vm_pageout_anonymous_pages();

		return 0;
	}

	lck_mtx_lock(&freezer_mutex);

	p = proc_find(pid);
	if (p != NULL) {
		uint32_t purgeable, wired, clean, dirty;
		boolean_t shared;
		uint32_t max_pages = 0;

		if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {

			unsigned int avail_swap_space = 0; /* in pages. */

			if (DEFAULT_FREEZER_IS_ACTIVE) {
				/*
				 * Freezer backed by default pager and swap file(s).
				 */
				avail_swap_space = default_pager_swap_pages_free();
			} else {
				/*
				 * Freezer backed by the compressor and swap file(s),
				 * which will hold compressed data.
				 */
				avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
			}

			max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);

		} else {
			/*
			 * We only have the compressor without any swap.
			 */
			max_pages = UINT32_MAX - 1;
		}

		error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
		proc_rele(p);

		if (error)
			error = EIO;

		lck_mtx_unlock(&freezer_mutex);
		return error;
	}

	lck_mtx_unlock(&freezer_mutex);
	return EINVAL;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
		0, 0, &sysctl_memorystatus_freeze, "I", "");
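
/*
 * Usage sketch (not in the original source; DEVELOPMENT/DEBUG kernels only):
 *	sysctl kern.memorystatus_freeze=<pid>	# attempt to freeze <pid>
 *	sysctl kern.memorystatus_freeze=2	# special case: compress anonymous pages
 * Returns ENOTSUP unless memorystatus_freeze_enabled is set.
 */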

static int
sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error, pid = 0;
	proc_t p;

	if (memorystatus_freeze_enabled == FALSE) {
		return ENOTSUP;
	}

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	p = proc_find(pid);
	if (p != NULL) {
		error = task_thaw(p->task);
		proc_rele(p);

		if (error)
			error = EIO;
		return error;
	}

	return EINVAL;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
		0, 0, &sysctl_memorystatus_available_pages_thaw, "I", "");
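
/*
 * Usage sketch (not in the original source):
 *	sysctl kern.memorystatus_thaw=<pid>
 * Thaws a previously frozen process via task_thaw(); returns EIO on
 * failure and EINVAL for an unknown pid.
 */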

#endif /* CONFIG_FREEZE */

#endif /* DEVELOPMENT || DEBUG */

extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
						  void *parameter,
						  integer_t priority,
						  thread_t *new_thread);

#if CONFIG_JETSAM
/*
 * Picks the sorting routine for a given jetsam priority band.
 *
 * Input:
 *	bucket_index - jetsam priority band to be sorted.
 *	sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
 *		Currently sort_order is only meaningful when handling
 *		coalitions.
 *
 * Return:
 *	0     on success
 *	non-0 on failure
 */
static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order)
{
	int coal_sort_order;

	/*
	 * Verify the jetsam priority
	 */
	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return(EINVAL);
	}

#if DEVELOPMENT || DEBUG
	if (sort_order == JETSAM_SORT_DEFAULT) {
		coal_sort_order = COALITION_SORT_DEFAULT;
	} else {
		coal_sort_order = sort_order;	/* only used for testing scenarios */
	}
#else
	/* Verify default */
	if (sort_order == JETSAM_SORT_DEFAULT) {
		coal_sort_order = COALITION_SORT_DEFAULT;
	} else {
		return(EINVAL);
	}
#endif

	proc_list_lock();
	switch (bucket_index) {
	case JETSAM_PRIORITY_FOREGROUND:
		if (memorystatus_sort_by_largest_coalition_locked(bucket_index, coal_sort_order) == 0) {
			/*
			 * Fall back to per process sorting when zero coalitions are found.
			 */
			memorystatus_sort_by_largest_process_locked(bucket_index);
		}
		break;
	default:
		memorystatus_sort_by_largest_process_locked(bucket_index);
		break;
	}
	proc_list_unlock();

	return(0);
}
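
/*
 * Example call (illustrative, not part of the original source): re-sort
 * the foreground band with the default policy; per the logic above, this
 * falls back to per-process sorting when no coalitions are found.
 *
 *	memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
 */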

/*
 * Sort processes by size for a single jetsam bucket.
 * This is an in-place selection sort: each pass finds the largest
 * remaining process and moves it forward in the bucket's list.
 */

static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
{
	proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
	proc_t next_p = NULL, prev_max_proc = NULL;
	uint32_t pages = 0, max_pages = 0;
	memstat_bucket_t *current_bucket;

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return;
	}

	current_bucket = &memstat_bucket[bucket_index];

	p = TAILQ_FIRST(&current_bucket->list);

	while (p) {
		memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
		max_pages = pages;
		max_proc = p;
		prev_max_proc = p;

		while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
			/* traversing list until we find next largest process */
			p = next_p;
			memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
			if (pages > max_pages) {
				max_pages = pages;
				max_proc = p;
			}
		}

		if (prev_max_proc != max_proc) {
			/* found a larger process, place it in the list */
			TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
			if (insert_after_proc == NULL) {
				TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
			} else {
				TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
			}
			prev_max_proc = max_proc;
		}

		insert_after_proc = max_proc;

		p = TAILQ_NEXT(max_proc, p_memstat_list);
	}
}

#endif /* CONFIG_JETSAM */

static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search) {
	memstat_bucket_t *current_bucket;
	proc_t next_p;

	if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
		return NULL;
	}

	current_bucket = &memstat_bucket[*bucket_index];
	next_p = TAILQ_FIRST(&current_bucket->list);
	if (!next_p && search) {
		while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
			current_bucket = &memstat_bucket[*bucket_index];
			next_p = TAILQ_FIRST(&current_bucket->list);
		}
	}

	return next_p;
}

static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search) {
	memstat_bucket_t *current_bucket;
	proc_t next_p;

	if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
		return NULL;
	}

	next_p = TAILQ_NEXT(p, p_memstat_list);
	while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
		current_bucket = &memstat_bucket[*bucket_index];
		next_p = TAILQ_FIRST(&current_bucket->list);
	}

	return next_p;
}
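
/*
 * Typical traversal (a sketch mirroring usage elsewhere in this file;
 * requires the proc list lock, and search==TRUE walks across buckets):
 *
 *	unsigned int b = 0;
 *	proc_t p = memorystatus_get_first_proc_locked(&b, TRUE);
 *	while (p) {
 *		... inspect p ...
 *		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
 *	}
 */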

__private_extern__ void
memorystatus_init(void)
{
	thread_t thread = THREAD_NULL;
	kern_return_t result;
	int i;

#if CONFIG_FREEZE
	memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
	memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
#endif

	nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_idle_delay_time);

	/* Init buckets */
	for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
		TAILQ_INIT(&memstat_bucket[i].list);
		memstat_bucket[i].count = 0;
	}

	memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);

	/* Apply overrides */
	PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
	assert(delta_percentage < 100);
	PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
	assert(critical_threshold_percentage < 100);
	PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
	assert(idle_offset_percentage < 100);
	PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
	assert(pressure_threshold_percentage < 100);
	PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
	assert(freeze_threshold_percentage < 100);

#if CONFIG_JETSAM
	/* device tree can request to take snapshots for idle-exit kills by default */
	PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));

	memorystatus_delta = delta_percentage * atop_64(max_mem) / 100;
	memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100;
	memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta;
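	/*
	 * Worked example (illustrative, not in the original source): with the
	 * defaults above on a 1 GB device with 4 KB pages (atop_64(max_mem) ==
	 * 262144), memorystatus_delta = 5 * 262144 / 100 = 13107 pages (~51 MB)
	 * and critical_base = (5 / 5) * 13107 = 13107 pages. Note the
	 * percentage ratio uses integer division.
	 */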

	memorystatus_jetsam_snapshot_max = maxproc;
	memorystatus_jetsam_snapshot =
		(memorystatus_jetsam_snapshot_t*)kalloc(sizeof(memorystatus_jetsam_snapshot_t) +
		sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
	if (!memorystatus_jetsam_snapshot) {
		panic("Could not allocate memorystatus_jetsam_snapshot");
	}

	nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);

	memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));

	/* No contention at this point */
	memorystatus_update_levels_locked(FALSE);

	/* Jetsam Loop Detection */
	if (max_mem <= (512 * 1024 * 1024)) {
		/* 512 MB devices */
		memorystatus_jld_eval_period_msecs = 8000;	/* 8000 msecs == 8 second window */
	} else {
		/* 1GB and larger devices */
		memorystatus_jld_eval_period_msecs = 6000;	/* 6000 msecs == 6 second window */
	}
#endif

#if CONFIG_FREEZE
	memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta;
#endif

	result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
	if (result == KERN_SUCCESS) {
		thread_deallocate(thread);
	} else {
		panic("Could not create memorystatus_thread");
	}
}

/* Centralised for the purposes of allowing panic-on-jetsam */
extern void
vm_wake_compactor_swapper(void);

/*
 * The jetsam no-frills kill call
 *	Return: 0 on success
 *		error code on failure (EINVAL...)
 */
static int
jetsam_do_kill(proc_t p, int jetsam_flags) {
	int error = 0;
	error = exit1_internal(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags);
	return(error);
}

/*
 * Wrapper for processes exiting with memorystatus details
 */
static boolean_t
memorystatus_do_kill(proc_t p, uint32_t cause) {

	int error = 0;
	__unused pid_t victim_pid = p->p_pid;

	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START,
			       victim_pid, cause, vm_page_free_count, 0, 0);

#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
	if (memorystatus_jetsam_panic_debug & (1 << cause)) {
		panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause);
	}
#else
#pragma unused(cause)
#endif
	int jetsam_flags = P_LTERM_JETSAM;
	switch (cause) {
	case kMemorystatusKilledHiwat:			jetsam_flags |= P_JETSAM_HIWAT; break;
	case kMemorystatusKilledVnodes:			jetsam_flags |= P_JETSAM_VNODE; break;
	case kMemorystatusKilledVMPageShortage:		jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
	case kMemorystatusKilledVMThrashing:		jetsam_flags |= P_JETSAM_VMTHRASHING; break;
	case kMemorystatusKilledFCThrashing:		jetsam_flags |= P_JETSAM_FCTHRASHING; break;
	case kMemorystatusKilledPerProcessLimit:	jetsam_flags |= P_JETSAM_PID; break;
	case kMemorystatusKilledIdleExit:		jetsam_flags |= P_JETSAM_IDLEEXIT; break;
	}
	error = jetsam_do_kill(p, jetsam_flags);

	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END,
			       victim_pid, cause, vm_page_free_count, error, 0);

	vm_wake_compactor_swapper();

	return (error == 0);
}

/*
 * Node manipulation
 */

static void
memorystatus_check_levels_locked(void) {
#if CONFIG_JETSAM
	/* Update levels */
	memorystatus_update_levels_locked(TRUE);
#endif
}

static void
memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
{
	proc_t p;
	uint64_t current_time;
	memstat_bucket_t *demotion_bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);

	current_time = mach_absolute_time();

	proc_list_lock();

	demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
	p = TAILQ_FIRST(&demotion_bucket->list);

	while (p) {
		MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid);

		assert(p->p_memstat_idledeadline);
		assert(p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS);
		assert((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED);

		if (current_time >= p->p_memstat_idledeadline) {
#if DEBUG || DEVELOPMENT
			if (!(p->p_memstat_dirty & P_DIRTY_MARKED)) {
				printf("memorystatus_perform_idle_demotion: moving process %d [%s] to idle band, but never dirtied (0x%x)!\n",
				       p->p_pid, (p->p_comm ? p->p_comm : "(unknown)"), p->p_memstat_dirty);
			}
#endif
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false);

			// The prior process has moved out of the demotion bucket, so grab the new head and continue
			p = TAILQ_FIRST(&demotion_bucket->list);
			continue;
		}

		// No further candidates
		break;
	}

	memorystatus_reschedule_idle_demotion_locked();

	proc_list_unlock();

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}

static void
memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
{
	boolean_t present_in_deferred_bucket = FALSE;

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		present_in_deferred_bucket = TRUE;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n",
	    p->p_pid, p->p_memstat_dirty, set_state, memorystatus_scheduled_idle_demotions);

	assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED);

	if (set_state) {
		assert(p->p_memstat_idledeadline == 0);
		p->p_memstat_dirty |= P_DIRTY_DEFER_IN_PROGRESS;
		p->p_memstat_idledeadline = mach_absolute_time() + memorystatus_idle_delay_time;
	}

	assert(p->p_memstat_idledeadline);

	if (present_in_deferred_bucket == FALSE) {
		memorystatus_scheduled_idle_demotions++;
	}
}

static void
memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
{
	boolean_t present_in_deferred_bucket = FALSE;

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		present_in_deferred_bucket = TRUE;
		assert(p->p_memstat_idledeadline);
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for pid %d (clear_state %d, demotions %d).\n",
	    p->p_pid, clear_state, memorystatus_scheduled_idle_demotions);

	if (clear_state) {
		p->p_memstat_idledeadline = 0;
		p->p_memstat_dirty &= ~P_DIRTY_DEFER_IN_PROGRESS;
	}

	if (present_in_deferred_bucket == TRUE) {
		memorystatus_scheduled_idle_demotions--;
	}

	assert(memorystatus_scheduled_idle_demotions >= 0);
}

static void
memorystatus_reschedule_idle_demotion_locked(void) {
	if (0 == memorystatus_scheduled_idle_demotions) {
		if (memstat_idle_demotion_deadline) {
			/* Transitioned 1->0, so cancel next call */
			thread_call_cancel(memorystatus_idle_demotion_call);
			memstat_idle_demotion_deadline = 0;
		}
	} else {
		memstat_bucket_t *demotion_bucket;
		proc_t p;
		demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
		p = TAILQ_FIRST(&demotion_bucket->list);

		assert(p && p->p_memstat_idledeadline);

		if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline){
			thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
			memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
		}
	}
}

/*
 * List manipulation
 */

int
memorystatus_add(proc_t p, boolean_t locked)
{
	memstat_bucket_t *bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority);

	if (!locked) {
		proc_list_lock();
	}

	/* Processes marked internal do not have priority tracked */
	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		goto exit;
	}

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		assert(bucket->count == memorystatus_scheduled_idle_demotions);
	}

	TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
	bucket->count++;

	memorystatus_list_count++;

	memorystatus_check_levels_locked();

exit:
	if (!locked) {
		proc_list_unlock();
	}

	return 0;
}

/*
 * Description:
 *	Moves a process from one jetsam bucket to another,
 *	which changes the LRU position of the process.
 *
 *	Monitors transition between buckets and if necessary
 *	will update cached memory limits accordingly.
 */
static void
memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert)
{
	memstat_bucket_t *old_bucket, *new_bucket;

	assert(priority < MEMSTAT_BUCKET_COUNT);

	/* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		return;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting pid %d to priority %d, inserting at %s\n",
	    p->p_pid, priority, head_insert ? "head" : "tail");

	old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		assert(old_bucket->count == (memorystatus_scheduled_idle_demotions + 1));
	}

	TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
	old_bucket->count--;

	new_bucket = &memstat_bucket[priority];
	if (head_insert)
		TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
	else
		TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	new_bucket->count++;

#if CONFIG_JETSAM
	if (memorystatus_highwater_enabled) {
		boolean_t trigger_exception;

		/*
		 * If cached limit data is updated, then the limits
		 * will be enforced by writing to the ledgers.
		 */
		boolean_t ledger_update_needed = TRUE;

		/*
		 * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore.
		 * Background limits are described via the inactive limit slots.
		 *
		 * Here, we must update the cached memory limit if the task
		 * is transitioning between:
		 *	active <--> inactive
		 *	FG <--> BG
		 * but:
		 *	dirty <--> clean is ignored
		 *
		 * We bypass processes that have opted into dirty tracking because
		 * a move between buckets does not imply a transition between the
		 * dirty <--> clean state.
		 * Setting limits on processes opted into dirty tracking is handled
		 * in memorystatus_dirty_set() where the transition is very clear.
		 */

		if (p->p_memstat_dirty & P_DIRTY_TRACK) {

			ledger_update_needed = FALSE;

		} else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) {
			/*
			 * inactive --> active
			 * BG --> FG
			 * assign active state
			 */
			CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);

		} else if ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
			/*
			 * active --> inactive
			 * FG --> BG
			 * assign inactive state
			 */
			CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
		} else {
			/*
			 * The transition between jetsam priority buckets apparently did
			 * not affect active/inactive state.
			 * This is not unusual... especially during startup when
			 * processes are getting established in their respective bands.
			 */
			ledger_update_needed = FALSE;
		}

		/*
		 * Enforce the new limits by writing to the ledger
		 */
		if (ledger_update_needed) {
			task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, trigger_exception);

			MEMORYSTATUS_DEBUG(3, "memorystatus_update_priority_locked: new limit on pid %d (%dMB %s) priority old --> new (%d --> %d) dirty?=0x%x %s\n",
			    p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
			    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, priority, p->p_memstat_dirty,
			    (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
		}
	}

#endif /* CONFIG_JETSAM */

	p->p_memstat_effectivepriority = priority;

	memorystatus_check_levels_locked();
}

/*
 * Description: Update the jetsam priority and memory limit attributes for a given process.
 *
 * Parameters:
 *	p			init this process's jetsam information.
 *	priority		The jetsam priority band
 *	user_data		user specific data, unused by the kernel
 *	effective		guards against race if process's update already occurred
 *	update_memlimit		When true we know this is the init step via the posix_spawn path.
 *
 *	memlimit_active		Value in megabytes; the monitored footprint level while the
 *				process is active.  Exceeding it may result in termination
 *				based on its associated fatal flag.
 *
 *	memlimit_active_is_fatal	When a process is active and exceeds its memory footprint,
 *				this describes whether or not it should be immediately fatal.
 *
 *	memlimit_inactive	Value in megabytes; the monitored footprint level while the
 *				process is inactive.  Exceeding it may result in termination
 *				based on its associated fatal flag.
 *
 *	memlimit_inactive_is_fatal	When a process is inactive and exceeds its memory footprint,
 *				this describes whether or not it should be immediately fatal.
 *
 *	memlimit_background	This process has a high-water-mark while in the background.
 *				No longer meaningful.  Background limits are described via
 *				the inactive slots.  Flag is ignored.
 *
 *
 * Returns:	0	Success
 *		non-0	Failure
 */

int
memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective, boolean_t update_memlimit,
		    int32_t memlimit_active,   boolean_t memlimit_active_is_fatal,
		    int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal,
		    __unused boolean_t memlimit_background)
{
	int ret;
	boolean_t head_insert = false;

#if !CONFIG_JETSAM
#pragma unused(update_memlimit, memlimit_active, memlimit_inactive)
#pragma unused(memlimit_active_is_fatal, memlimit_inactive_is_fatal)
#endif /* !CONFIG_JETSAM */

	MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing pid %d: priority %d, user_data 0x%llx\n", p->p_pid, priority, user_data);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0);

	if (priority == -1) {
		/* Use as shorthand for default priority */
		priority = JETSAM_PRIORITY_DEFAULT;
	} else if (priority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		/* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
		priority = JETSAM_PRIORITY_IDLE;
	} else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
		/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
		priority = JETSAM_PRIORITY_IDLE;
		head_insert = TRUE;
	} else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
		/* Sanity check */
		ret = EINVAL;
		goto out;
	}

	proc_list_lock();

	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
		ret = EALREADY;
		proc_list_unlock();
		MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid);
		goto out;
	}

	if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) {
		/*
		 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
		 */
		ret = EBUSY;
		proc_list_unlock();
		goto out;
	}

	p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
	p->p_memstat_userdata = user_data;
	p->p_memstat_requestedpriority = priority;

#if CONFIG_JETSAM
	if (update_memlimit) {
		boolean_t trigger_exception;

		/*
		 * Posix_spawn'd processes come through this path to instantiate ledger limits.
		 * Forked processes do not come through this path, so no ledger limits exist.
		 * (That's why forked processes can consume unlimited memory.)
		 */

		MEMORYSTATUS_DEBUG(3, "memorystatus_update(enter): pid %d, priority %d, dirty=0x%x, Active(%dMB %s), Inactive(%dMB, %s)\n",
		    p->p_pid, priority, p->p_memstat_dirty,
		    memlimit_active,   (memlimit_active_is_fatal ? "F " : "NF"),
		    memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF"));

		if (memlimit_background) {

			/*
			 * With 2-level HWM support, we no longer honor P_MEMSTAT_MEMLIMIT_BACKGROUND.
			 * Background limits are described via the inactive limit slots.
			 */

			// p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_BACKGROUND;

#if DEVELOPMENT || DEBUG
			printf("memorystatus_update: WARNING %s[%d] set unused flag P_MEMSTAT_MEMLIMIT_BACKGROUND [A==%dMB %s] [IA==%dMB %s]\n",
			       (p->p_comm ? p->p_comm : "unknown"), p->p_pid,
			       memlimit_active,   (memlimit_active_is_fatal ? "F " : "NF"),
			       memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF"));
#endif /* DEVELOPMENT || DEBUG */
		}

		if (memlimit_active <= 0) {
			/*
			 * This process will have a system_wide task limit when active.
			 * The system_wide task limit is always fatal.
			 * It's quite common to see a non-fatal flag passed in here.
			 * It's not an error; we just ignore it.
			 */

			/*
			 * For backward compatibility with some unexplained launchd behavior,
			 * we allow a zero-sized limit.  But we still enforce the system_wide
			 * limit when written to the ledgers.
			 */

			if (memlimit_active < 0) {
				memlimit_active = -1;	/* enforces system_wide task limit */
			}
3e170ce0 1627 memlimit_active_is_fatal = TRUE;
316670eb 1628 }
3e170ce0
A
1629
1630 if (memlimit_inactive <= 0) {
1631 /*
1632 * This process will have a system_wide task limit when inactive.
1633 * System_wide task limit is always fatal.
1634 */
1635
1636 memlimit_inactive = -1;
1637 memlimit_inactive_is_fatal = TRUE;
fe8ab488 1638 }
316670eb 1639
3e170ce0
A
1640 /*
1641 * Initialize the active limit variants for this process.
1642 */
1643 SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);
1644
1645 /*
1646 * Initialize the inactive limit variants for this process.
1647 */
1648 SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);
1649
1650 /*
1651 * Initialize the cached limits for target process.
1652 * When the target process is dirty tracked, it's typically
1653 * in a clean state. Non dirty tracked processes are
1654 * typically active (Foreground or above).
1655 * But just in case, we don't make assumptions...
1656 */
1657
1658 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
1659 CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);
1660 } else {
1661 CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
1662 }
1663
1664 /*
1665 * Enforce the cached limit by writing to the ledger.
1666 */
1667 if (memorystatus_highwater_enabled) {
1668 /* apply now */
1669 assert(trigger_exception == TRUE);
1670 task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, trigger_exception);
1671
1672 MEMORYSTATUS_DEBUG(3, "memorystatus_update: init: limit on pid %d (%dMB %s) targeting priority(%d) dirty?=0x%x %s\n",
1673 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
1674 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), priority, p->p_memstat_dirty,
1675 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
1676 }
1677 }
1678#endif /* CONFIG_JETSAM */
1679
1680 /*
1681 * We can't add to the JETSAM_PRIORITY_IDLE_DEFERRED bucket here.
1682 * But, we could be removing it from the bucket.
1683 * Check and take appropriate steps if so.
1684 */
1685
1686 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
1687
fe8ab488
A
1688 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1689 }
1690
1691 memorystatus_update_priority_locked(p, priority, head_insert);
39236c6e
A
1692
1693 proc_list_unlock();
1694 ret = 0;
316670eb
A
1695
1696out:
39236c6e
A
1697 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);
1698
316670eb
A
1699 return ret;
1700}
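
/*
 * Illustrative sketch, not part of this file: how a caller on the
 * posix_spawn path might instantiate jetsam attributes with
 * memorystatus_update().  The band and limits below are hypothetical
 * values chosen for the example.
 *
 *	proc_t p = ...;				// newly spawned process
 *	int error = memorystatus_update(p,
 *		JETSAM_PRIORITY_DEFAULT,	// priority band
 *		0,				// user_data, opaque to the kernel
 *		FALSE,				// effective: not guarding a prior update
 *		TRUE,				// update_memlimit: posix_spawn init step
 *		150, FALSE,			// active limit: 150MB, non-fatal (HWM)
 *		50, TRUE,			// inactive limit: 50MB, fatal
 *		FALSE);				// memlimit_background: ignored
 *	if (error) {
 *		// EINVAL, EALREADY or EBUSY, per the checks above
 *	}
 */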

int
memorystatus_remove(proc_t p, boolean_t locked)
{
	int ret;
	memstat_bucket_t *bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", p->p_pid);

	if (!locked) {
		proc_list_lock();
	}

	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];
	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		assert(bucket->count == memorystatus_scheduled_idle_demotions);
	}

	TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
	bucket->count--;

	memorystatus_list_count--;

	/* If awaiting demotion to the idle band, clean up */
	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		memorystatus_invalidate_idle_demotion_locked(p, TRUE);
		memorystatus_reschedule_idle_demotion_locked();
	}

	memorystatus_check_levels_locked();

#if CONFIG_FREEZE
	if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
		memorystatus_frozen_count--;
	}

	if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
		memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
		memorystatus_suspended_count--;
	}
#endif

	if (!locked) {
		proc_list_unlock();
	}

	if (p) {
		ret = 0;
	} else {
		ret = ESRCH;
	}

	return ret;
}

/*
 * Validate dirty tracking flags with process state.
 *
 * Return:
 *	0 on success
 *	non-0 on failure
 */

static int
memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) {
	/* See that the process isn't marked for termination */
	if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
		return EBUSY;
	}

	/* Idle exit requires that process be tracked */
	if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
	    !(pcontrol & PROC_DIRTY_TRACK)) {
		return EINVAL;
	}

	/* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
	if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
	    !(pcontrol & PROC_DIRTY_TRACK)) {
		return EINVAL;
	}

	/* Deferral is only relevant if idle exit is specified */
	if ((pcontrol & PROC_DIRTY_DEFER) &&
	    !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
		return EINVAL;
	}

	return(0);
}

static void
memorystatus_update_idle_priority_locked(proc_t p) {
	int32_t priority;

	MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty);

	if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
		priority = (p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS) ? JETSAM_PRIORITY_IDLE_DEFERRED : JETSAM_PRIORITY_IDLE;
	} else {
		priority = p->p_memstat_requestedpriority;
	}

	if (priority != p->p_memstat_effectivepriority) {
		memorystatus_update_priority_locked(p, priority, false);
	}
}

/*
 * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
 * (clean).  They may also indicate that they support termination when idle, with the result that they are promoted
 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
 * priority idle band when clean (and killed earlier, protecting higher priority processes).
 *
 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
 * memorystatus_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
 * make forward progress.  Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
 * band.  The deferral can be cleared early by clearing the appropriate flag.
 *
 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
 * is marked dirty, the timer will be cancelled.  Upon being subsequently marked clean, the deferment will either be
 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.  An illustrative
 * userspace sketch of this opt-in flow follows this comment.
 */
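
/*
 * Illustrative userspace sketch (hypothetical, not part of the kernel build):
 * a daemon opting into the flow described above, assuming the libproc
 * wrappers proc_track_dirty() and proc_set_dirty() that front this interface.
 *
 *	#include <libproc.h>
 *	#include <unistd.h>
 *	#include <stdbool.h>
 *
 *	uint32_t flags = PROC_DIRTY_TRACK | PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER;
 *	(void)proc_track_dirty(getpid(), flags);	// opt in; starts clean, deferred
 *
 *	(void)proc_set_dirty(getpid(), true);		// busy: promoted to requested band
 *	// ... do work ...
 *	(void)proc_set_dirty(getpid(), false);		// idle: demoted, idle-exit eligible
 */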

int
memorystatus_dirty_track(proc_t p, uint32_t pcontrol) {
	unsigned int old_dirty;
	boolean_t reschedule = FALSE;
	boolean_t already_deferred = FALSE;
	boolean_t defer_now = FALSE;
	int ret = 0;

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK),
		p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0);

	proc_list_lock();

	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
		/* error */
		goto exit;
	}

	old_dirty = p->p_memstat_dirty;

	/* These bits are cumulative, as per <rdar://problem/11159924> */
	if (pcontrol & PROC_DIRTY_TRACK) {
		p->p_memstat_dirty |= P_DIRTY_TRACK;
	}

	if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
		p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	if (old_dirty & P_DIRTY_DEFER_IN_PROGRESS) {
		already_deferred = TRUE;
	}

	/* This can be set and cleared exactly once. */
	if (pcontrol & PROC_DIRTY_DEFER) {

		if ( !(old_dirty & P_DIRTY_DEFER)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER;
		}

		defer_now = TRUE;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for pid %d\n",
		((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
		defer_now ? "Y" : "N",
		p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
		p->p_pid);

	/* Kick off or invalidate the idle exit deferment if there's a state transition. */
	if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
		if (((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) &&
		    defer_now && !already_deferred) {

			/*
			 * Request to defer a clean process that's idle-exit enabled
			 * and not already in the jetsam deferred band.
			 */
			memorystatus_schedule_idle_demotion_locked(p, TRUE);
			reschedule = TRUE;

		} else if (!defer_now && already_deferred) {

			/*
			 * Either the process is no longer idle-exit enabled OR
			 * there's a request to cancel a currently active deferral.
			 */
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			reschedule = TRUE;
		}
	} else {

		/*
		 * We are trying to operate on a dirty process.  Dirty processes have to
		 * be removed from the deferred band.  The question is do we reset the
		 * deferred state or not?
		 *
		 * This could be a legal request like:
		 * - this process had opted into the JETSAM_DEFERRED band
		 * - but it's now dirty and requests to opt out.
		 * In this case, we remove the process from the band and reset its
		 * state too.  It'll opt back in properly when needed.
		 *
		 * OR, this request could be a user-space bug.  E.g.:
		 * - this process had opted into the JETSAM_DEFERRED band when clean
		 * - and, then issues another request to again put it into the band except
		 *   this time the process is dirty.
		 * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of
		 * the deferred band with its state intact.  So our request below is a no-op.
		 * But we do it here anyway for coverage.
		 *
		 * memorystatus_update_idle_priority_locked()
		 * single-mindedly treats a dirty process as "cannot be in the deferred band".
		 */

		if (!defer_now && already_deferred) {
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			reschedule = TRUE;
		} else {
			memorystatus_invalidate_idle_demotion_locked(p, FALSE);
			reschedule = TRUE;
		}
	}

	memorystatus_update_idle_priority_locked(p);

	if (reschedule) {
		memorystatus_reschedule_idle_demotion_locked();
	}

	ret = 0;

exit:
	proc_list_unlock();

	return ret;
}

int
memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) {
	int ret;
	boolean_t kill = false;
	boolean_t reschedule = FALSE;
	boolean_t was_dirty = FALSE;
	boolean_t now_dirty = FALSE;

	MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0);

	proc_list_lock();

	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
		was_dirty = TRUE;

	if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
		/* Dirty tracking not enabled */
		ret = EINVAL;
	} else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
		/*
		 * Process is set to be terminated and we're attempting to mark it dirty.
		 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
		 */
		ret = EBUSY;
	} else {
		int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
		if (pcontrol && !(p->p_memstat_dirty & flag)) {
			/* Mark the process as having been dirtied at some point */
			p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
			memorystatus_dirty_count++;
			ret = 0;
		} else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
			if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
				/* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
				kill = true;
			} else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
				/* Kill previously terminated processes if set clean */
				kill = true;
			}
			p->p_memstat_dirty &= ~flag;
			memorystatus_dirty_count--;
			ret = 0;
		} else {
			/* Already set */
			ret = EALREADY;
		}
	}

	if (ret != 0) {
		goto exit;
	}

	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
		now_dirty = TRUE;

	if ((was_dirty == TRUE && now_dirty == FALSE) ||
	    (was_dirty == FALSE && now_dirty == TRUE)) {

		/* Manage idle exit deferral, if applied */
		if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) ==
		    (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) {

			/*
			 * P_DIRTY_DEFER_IN_PROGRESS means the process is in the deferred band OR it might be heading back
			 * there once it's clean again and has some protection window left.
			 */

			if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
				/*
				 * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE"
				 *
				 * The process will move from the deferred band to its higher requested
				 * jetsam band.  But we don't clear its state i.e. we want to remember that
				 * this process was part of the "deferred" band and will return to it.
				 *
				 * This way, we don't let it age beyond the protection
				 * window when it returns to "clean".  All the while giving
				 * it a chance to perform its work while "dirty".
				 */
				memorystatus_invalidate_idle_demotion_locked(p, FALSE);
				reschedule = TRUE;
			} else {

				/*
				 * Process is back from "dirty" to "clean".
				 *
				 * Is its timer up OR does it still have some protection
				 * window left?
				 */

				if (mach_absolute_time() >= p->p_memstat_idledeadline) {
					/*
					 * The process' deadline has expired.  It currently
					 * does not reside in the DEFERRED bucket.
					 *
					 * It's on its way to the JETSAM_PRIORITY_IDLE
					 * bucket via memorystatus_update_idle_priority_locked()
					 * below.
					 *
					 * So all we need to do is reset all the state on the
					 * process that's related to the DEFERRED bucket i.e.
					 * the DIRTY_DEFER_IN_PROGRESS flag and the timer deadline.
					 */

					memorystatus_invalidate_idle_demotion_locked(p, TRUE);
					reschedule = TRUE;
				} else {
					/*
					 * It still has some protection window left and so
					 * we just re-arm the timer without modifying any
					 * state on the process.
					 */
					memorystatus_schedule_idle_demotion_locked(p, FALSE);
					reschedule = TRUE;
				}
			}
		}

		memorystatus_update_idle_priority_locked(p);

#if CONFIG_JETSAM
		if (memorystatus_highwater_enabled) {
			boolean_t trigger_exception;
			/*
			 * We are in this path because this process transitioned between
			 * dirty <--> clean state.  Update the cached memory limits.
			 */

			if (proc_jetsam_state_is_active_locked(p) == TRUE) {
				/*
				 * process is dirty
				 */
				CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);
			} else {
				/*
				 * process is clean
				 */
				CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
			}

			/*
			 * Enforce the new limits by writing to the ledger.
			 *
			 * This is a hot path and holding the proc_list_lock while writing to the ledgers,
			 * (where the task lock is taken) is bad.  So, we temporarily drop the proc_list_lock.
			 * We aren't traversing the jetsam bucket list here, so we should be safe.
			 * See rdar://21394491.
			 */

			if (proc_ref_locked(p) == p) {
				int ledger_limit;
				if (p->p_memstat_memlimit > 0) {
					ledger_limit = p->p_memstat_memlimit;
				} else {
					ledger_limit = -1;
				}
				proc_list_unlock();
				task_set_phys_footprint_limit_internal(p->task, ledger_limit, NULL, trigger_exception);
				proc_list_lock();
				proc_rele_locked(p);

				MEMORYSTATUS_DEBUG(3, "memorystatus_dirty_set: new limit on pid %d (%dMB %s) priority(%d) dirty?=0x%x %s\n",
					p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
					(p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
					(p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
			}

		}
#endif /* CONFIG_JETSAM */

		/* If the deferral state changed, reschedule the demotion timer */
		if (reschedule) {
			memorystatus_reschedule_idle_demotion_locked();
		}
	}

	if (kill) {
		if (proc_ref_locked(p) == p) {
			proc_list_unlock();
			psignal(p, SIGKILL);
			proc_list_lock();
			proc_rele_locked(p);
		}
	}

exit:
	proc_list_unlock();

	return ret;
}

int
memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) {

	int ret = 0;

	MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0);

	proc_list_lock();

	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
		/* Dirty tracking not enabled */
		ret = EINVAL;
		goto exit;
	}

	if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER)) == 0) {
		ret = EINVAL;
		goto exit;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	/* This can be set and cleared exactly once. */
	if (pcontrol & PROC_DIRTY_DEFER) {

		if (p->p_memstat_dirty & P_DIRTY_DEFER) {

			p->p_memstat_dirty &= ~P_DIRTY_DEFER;

			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			memorystatus_update_idle_priority_locked(p);
			memorystatus_reschedule_idle_demotion_locked();
		}
	}

	ret = 0;
exit:
	proc_list_unlock();

	return ret;
}

int
memorystatus_dirty_get(proc_t p) {
	int ret = 0;

	proc_list_lock();

	if (p->p_memstat_dirty & P_DIRTY_TRACK) {
		ret |= PROC_DIRTY_TRACKED;
		if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
			ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
		}
		if (p->p_memstat_dirty & P_DIRTY) {
			ret |= PROC_DIRTY_IS_DIRTY;
		}
		if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
			ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
		}
	}

	proc_list_unlock();

	return ret;
}
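
/*
 * Illustrative sketch (hypothetical): the status bits returned above can be
 * read from userspace, assuming the libproc wrapper proc_get_dirty():
 *
 *	uint32_t flags = 0;
 *	if (proc_get_dirty(getpid(), &flags) == 0) {
 *		bool tracked = (flags & PROC_DIRTY_TRACKED);
 *		bool dirty   = (flags & PROC_DIRTY_IS_DIRTY);
 *		// a tracked, clean, idle-exit-capable process is an idle-exit candidate
 *	}
 */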

int
memorystatus_on_terminate(proc_t p) {
	int sig;

	proc_list_lock();

	p->p_memstat_dirty |= P_DIRTY_TERMINATED;

	if ((p->p_memstat_dirty & (P_DIRTY_TRACK|P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) {
		/* Clean; mark as terminated and issue SIGKILL */
		sig = SIGKILL;
	} else {
		/* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
		sig = SIGTERM;
	}

	proc_list_unlock();

	return sig;
}

void
memorystatus_on_suspend(proc_t p)
{
#if CONFIG_FREEZE
	uint32_t pages;
	memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
#endif
	proc_list_lock();
#if CONFIG_FREEZE
	p->p_memstat_suspendedfootprint = pages;
	memorystatus_suspended_footprint_total += pages;
	memorystatus_suspended_count++;
#endif
	p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
	proc_list_unlock();
}

void
memorystatus_on_resume(proc_t p)
{
#if CONFIG_FREEZE
	boolean_t frozen;
	pid_t pid;
#endif

	proc_list_lock();

#if CONFIG_FREEZE
	frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
	if (frozen) {
		memorystatus_frozen_count--;
		p->p_memstat_state |= P_MEMSTAT_PRIOR_THAW;
	}

	memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
	memorystatus_suspended_count--;

	pid = p->p_pid;
#endif

	p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);

	proc_list_unlock();

#if CONFIG_FREEZE
	if (frozen) {
		memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
		memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
	}
#endif
}

void
memorystatus_on_inactivity(proc_t p)
{
#pragma unused(p)
#if CONFIG_FREEZE
	/* Wake the freeze thread */
	thread_wakeup((event_t)&memorystatus_freeze_wakeup);
#endif
}

static uint32_t
memorystatus_build_state(proc_t p) {
	uint32_t snapshot_state = 0;

	/* General */
	if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
		snapshot_state |= kMemorystatusSuspended;
	}
	if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
		snapshot_state |= kMemorystatusFrozen;
	}
	if (p->p_memstat_state & P_MEMSTAT_PRIOR_THAW) {
		snapshot_state |= kMemorystatusWasThawed;
	}

	/* Tracking */
	if (p->p_memstat_dirty & P_DIRTY_TRACK) {
		snapshot_state |= kMemorystatusTracked;
	}
	if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
		snapshot_state |= kMemorystatusSupportsIdleExit;
	}
	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
		snapshot_state |= kMemorystatusDirty;
	}

	return snapshot_state;
}
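
/*
 * Illustrative sketch (hypothetical): a consumer of a jetsam snapshot entry
 * can test the composite state built above against the kMemorystatus* flags,
 * for example:
 *
 *	uint32_t state = entry->state;
 *	if ((state & kMemorystatusSupportsIdleExit) && !(state & kMemorystatusDirty)) {
 *		// clean, idle-exit-enabled process: an early jetsam candidate
 *	}
 */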

#if !CONFIG_JETSAM

static boolean_t
kill_idle_exit_proc(void)
{
	proc_t p, victim_p = PROC_NULL;
	uint64_t current_time;
	boolean_t killed = FALSE;
	unsigned int i = 0;

	/* Pick next idle exit victim. */
	current_time = mach_absolute_time();

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, FALSE);
	while (p) {
		/* No need to look beyond the idle band */
		if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
			break;
		}

		if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT|P_DIRTY_IS_DIRTY|P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
			if (current_time >= p->p_memstat_idledeadline) {
				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
				victim_p = proc_ref_locked(p);
				break;
			}
		}

		p = memorystatus_get_next_proc_locked(&i, p, FALSE);
	}

	proc_list_unlock();

	if (victim_p) {
		printf("memorystatus_thread: idle exiting pid %d [%s]\n", victim_p->p_pid, (victim_p->p_comm ? victim_p->p_comm : "(unknown)"));
		killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit);
		proc_rele(victim_p);
	}

	return killed;
}
#endif

#if CONFIG_JETSAM
static void
memorystatus_thread_wake(void) {
	thread_wakeup((event_t)&memorystatus_wakeup);
}
#endif /* CONFIG_JETSAM */

extern void vm_pressure_response(void);

static int
memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
{
	if (interval_ms) {
		assert_wait_timeout(&memorystatus_wakeup, THREAD_UNINT, interval_ms, 1000 * NSEC_PER_USEC);
	} else {
		assert_wait(&memorystatus_wakeup, THREAD_UNINT);
	}

	return thread_block(continuation);
}

static void
memorystatus_thread(void *param __unused, wait_result_t wr __unused)
{
	static boolean_t is_vm_privileged = FALSE;

#if CONFIG_JETSAM
	boolean_t post_snapshot = FALSE;
	uint32_t errors = 0;
	uint32_t hwm_kill = 0;
	boolean_t sort_flag = TRUE;

	/* Jetsam Loop Detection - locals */
	memstat_bucket_t *bucket;
	int		jld_bucket_count = 0;
	struct timeval	jld_now_tstamp = {0,0};
	uint64_t	jld_now_msecs = 0;

	/* Jetsam Loop Detection - statics */
	static uint64_t	jld_timestamp_msecs = 0;
	static int	jld_idle_kill_candidates = 0;	/* Number of available processes in band 0,1 at start */
	static int	jld_idle_kills = 0;		/* Number of procs killed during eval period */
	static int	jld_eval_aggressive_count = 0;	/* Bumps the max priority in aggressive loop */
	static int32_t	jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
#endif

	if (is_vm_privileged == FALSE) {
		/*
		 * It's the first time the thread has run, so just mark the thread as privileged and block.
		 * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
		 */
		thread_wire(host_priv_self(), current_thread(), TRUE);
		is_vm_privileged = TRUE;

		if (vm_restricted_to_single_processor == TRUE)
			thread_vm_bind_group_add();

		memorystatus_thread_block(0, memorystatus_thread);
	}

#if CONFIG_JETSAM

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
		memorystatus_available_pages, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, memorystatus_jld_eval_aggressive_count, 0);

	/*
	 * Jetsam aware version.
	 *
	 * The VM pressure notification thread is working its way through clients in parallel.
	 *
	 * So, while the pressure notification thread is targeting processes in order of
	 * increasing jetsam priority, we can hopefully reduce / stop its work by killing
	 * any processes that have exceeded their highwater mark.
	 *
	 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
	 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
	 */
	while (is_thrashing(kill_under_pressure_cause) ||
	       memorystatus_available_pages <= memorystatus_available_pages_pressure) {
		boolean_t killed;
		int32_t priority;
		uint32_t cause;

		if (kill_under_pressure_cause) {
			cause = kill_under_pressure_cause;
		} else {
			cause = kMemorystatusKilledVMPageShortage;
		}

#if LEGACY_HIWATER
		/* Highwater */
		killed = memorystatus_kill_hiwat_proc(&errors);
		if (killed) {
			hwm_kill++;
			post_snapshot = TRUE;
			goto done;
		} else {
			memorystatus_hwm_candidates = FALSE;
		}

		/* No highwater processes to kill. Continue or stop for now? */
		if (!is_thrashing(kill_under_pressure_cause) &&
		    (memorystatus_available_pages > memorystatus_available_pages_critical)) {
			/*
			 * We are _not_ out of pressure but we are above the critical threshold and there's:
			 * - no compressor thrashing
			 * - no more HWM processes left.
			 * For now, don't kill any other processes.
			 */

			if (hwm_kill == 0) {
				memorystatus_thread_wasted_wakeup++;
			}

			break;
		}
#endif
		if (memorystatus_jld_enabled == TRUE) {

			/*
			 * Jetsam Loop Detection: attempt to detect
			 * rapid daemon relaunches in the lower bands.
			 */

			microuptime(&jld_now_tstamp);

			/*
			 * Ignore usecs in this calculation.
			 * msecs granularity is close enough.
			 */
			jld_now_msecs = (jld_now_tstamp.tv_sec * 1000);

			proc_list_lock();
			bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
			jld_bucket_count = bucket->count;
			bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
			jld_bucket_count += bucket->count;
			proc_list_unlock();

			/*
			 * memorystatus_jld_eval_period_msecs is a tunable
			 * memorystatus_jld_eval_aggressive_count is a tunable
			 * memorystatus_jld_eval_aggressive_priority_band_max is a tunable
			 */
			if ( (jld_bucket_count == 0) ||
			     (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) {

				/*
				 * Refresh evaluation parameters
				 */
				jld_timestamp_msecs	  = jld_now_msecs;
				jld_idle_kill_candidates  = jld_bucket_count;
				jld_idle_kills		  = 0;
				jld_eval_aggressive_count = 0;
				jld_priority_band_max	  = JETSAM_PRIORITY_UI_SUPPORT;
			}

			if (jld_idle_kills > jld_idle_kill_candidates) {
				jld_eval_aggressive_count++;
				if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) {
					/*
					 * Bump up the jetsam priority limit (eg: the bucket index)
					 * Enforce bucket index sanity.
					 */
					if ((memorystatus_jld_eval_aggressive_priority_band_max < 0) ||
					    (memorystatus_jld_eval_aggressive_priority_band_max >= MEMSTAT_BUCKET_COUNT)) {
						/*
						 * Do nothing.  Stick with the default level.
						 */
					} else {
						jld_priority_band_max = memorystatus_jld_eval_aggressive_priority_band_max;
					}
				}

				killed = memorystatus_kill_top_process_aggressive(
					TRUE,
					kMemorystatusKilledVMThrashing,
					jld_eval_aggressive_count,
					jld_priority_band_max,
					&errors);

				if (killed) {
					/* Always generate logs after aggressive kill */
					post_snapshot = TRUE;
					goto done;
				}
			}
		}

		/* LRU */
		killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, &priority, &errors);
		sort_flag = FALSE;

		if (killed) {
			/*
			 * Don't generate logs for steady-state idle-exit kills,
			 * unless it is overridden for debug or by the device
			 * tree.
			 */
			if ((priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot) {
				post_snapshot = TRUE;
			}

			/* Jetsam Loop Detection */
			if (memorystatus_jld_enabled == TRUE) {
				if ((priority == JETSAM_PRIORITY_IDLE) || (priority == JETSAM_PRIORITY_IDLE_DEFERRED)) {
					jld_idle_kills++;
				} else {
					/*
					 * We've reached into bands beyond idle deferred.
					 * We make no attempt to monitor them.
					 */
				}
			}
			goto done;
		}

		if (memorystatus_available_pages <= memorystatus_available_pages_critical) {
			/* Under pressure and unable to kill a process - panic */
			panic("memorystatus_jetsam_thread: no victim! available pages:%d\n", memorystatus_available_pages);
		}

done:

		/*
		 * We do not want to over-kill when thrashing has been detected.
		 * To avoid that, we reset the flag here and notify the
		 * compressor.
		 */
		if (is_thrashing(kill_under_pressure_cause)) {
			kill_under_pressure_cause = 0;
			vm_thrashing_jetsam_done();
		}
	}

	kill_under_pressure_cause = 0;

	if (errors) {
		memorystatus_clear_errors();
	}

#if VM_PRESSURE_EVENTS
	/*
	 * LD: We used to target the foreground process first and foremost here.
	 * Now, we target all processes, starting from the non-suspended, background
	 * processes first.  We will target foreground too.
	 *
	 * memorystatus_update_vm_pressure(TRUE);
	 */
	//vm_pressure_response();
#endif

	if (post_snapshot) {
		size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
			sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
		uint64_t timestamp_now = mach_absolute_time();
		memorystatus_jetsam_snapshot->notification_time = timestamp_now;
		if (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
		    timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout) {
			int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
			if (!ret) {
				proc_list_lock();
				memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
				proc_list_unlock();
			}
		}
	}

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
		memorystatus_available_pages, 0, 0, 0, 0);

#else /* CONFIG_JETSAM */

	/*
	 * Jetsam not enabled
	 */

#endif /* CONFIG_JETSAM */

	memorystatus_thread_block(0, memorystatus_thread);
}

#if !CONFIG_JETSAM
/*
 * Returns TRUE:
 *	when an idle-exitable proc was killed
 * Returns FALSE:
 *	when there are no more idle-exitable procs found
 *	when the attempt to kill an idle-exitable proc failed
 */
boolean_t memorystatus_idle_exit_from_VM(void) {
	return(kill_idle_exit_proc());
}
#endif /* !CONFIG_JETSAM */

#if CONFIG_JETSAM

/*
 * Callback invoked when the allowable physical memory footprint has been exceeded
 * (dirty pages + IOKit mappings).
 *
 * This is invoked for both advisory, non-fatal per-task high watermarks
 * and fatal task memory limits.
 */
void
memorystatus_on_ledger_footprint_exceeded(boolean_t warning, const int max_footprint_mb)
{
	boolean_t is_active;
	boolean_t is_fatal;

	proc_t p = current_proc();

	proc_list_lock();

	is_active = proc_jetsam_state_is_active_locked(p);
	is_fatal  = (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT);

	if (warning == FALSE) {
		/*
		 * We only want the EXC_RESOURCE to trigger once per lifetime
		 * of the active/inactive limit state.  So, here, we detect the
		 * active/inactive state of the process and mark that state
		 * as having triggered its exception.
		 */
		if (is_active == TRUE) {
			/*
			 * turn off exceptions for active state
			 */
			p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_EXC_TRIGGERED;
		} else {
			/*
			 * turn off exceptions for inactive state
			 */
			p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_EXC_TRIGGERED;
		}

		/*
		 * Soft memory limit is a non-fatal high-water-mark
		 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
		 */
		printf("process %d (%s) exceeded physical memory footprint, the %s%sMemoryLimit of %d MB\n",
			p->p_pid, p->p_comm, (is_active ? "Active" : "Inactive"),
			(is_fatal ? "Hard" : "Soft"), max_footprint_mb);

	}

	proc_list_unlock();

#if VM_PRESSURE_EVENTS
	if (warning == TRUE) {
		if (memorystatus_warn_process(p->p_pid, TRUE /* critical? */) != TRUE) {
			/* Print warning, since it's possible that task has not registered for pressure notifications */
			printf("task_exceeded_footprint: failed to warn the current task (exiting, or no handler registered?).\n");
		}
		return;
	}
#endif /* VM_PRESSURE_EVENTS */

	if (is_fatal) {
		/*
		 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
		 * has violated either the system-wide per-task memory limit OR its own task limit.
		 */
		if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit) != TRUE) {
			printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
		}
	} else {
		/*
		 * HWM offender exists.  Done without locks or synchronization.
		 * See comment near its declaration for more details.
		 */
		memorystatus_hwm_candidates = TRUE;
	}
}
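
/*
 * Illustrative userspace sketch (hypothetical, not part of the kernel build):
 * the warning path above only helps a task that listens for memory pressure.
 * One way to register is a libdispatch memory-pressure source:
 *
 *	#include <dispatch/dispatch.h>
 *
 *	dispatch_source_t src = dispatch_source_create(
 *		DISPATCH_SOURCE_TYPE_MEMORYPRESSURE, 0,
 *		DISPATCH_MEMORYPRESSURE_WARN | DISPATCH_MEMORYPRESSURE_CRITICAL,
 *		dispatch_get_main_queue());
 *	dispatch_source_set_event_handler(src, ^{
 *		// trim caches / free memory before the limit becomes fatal
 *	});
 *	dispatch_resume(src);
 */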

/*
 * Toggle the P_MEMSTAT_TERMINATED state.
 * Takes the proc_list_lock.
 */
void
proc_memstat_terminated(proc_t p, boolean_t set)
{
#if DEVELOPMENT || DEBUG
	if (p) {
		proc_list_lock();
		if (set == TRUE) {
			p->p_memstat_state |= P_MEMSTAT_TERMINATED;
		} else {
			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
		}
		proc_list_unlock();
	}
#else
#pragma unused(p, set)
	/*
	 * do nothing
	 */
#endif /* DEVELOPMENT || DEBUG */
	return;
}

/*
 * This is invoked when cpulimits have been exceeded while in fatal mode.
 * The jetsam_flags do not apply as those are for memory related kills.
 * We call this routine so that the offending process is killed with
 * a non-zero exit status.
 */
void
jetsam_on_ledger_cpulimit_exceeded(void)
{
	int retval = 0;
	int jetsam_flags = 0;  /* make it obvious */
	proc_t p = current_proc();

	printf("task_exceeded_cpulimit: killing pid %d [%s]\n",
		p->p_pid, (p->p_comm ? p->p_comm : "(unknown)"));

	retval = jetsam_do_kill(p, jetsam_flags);

	if (retval) {
		printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
	}
}

static void
memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
{
	assert(task);
	assert(footprint);

	*footprint = (uint32_t)(get_task_phys_footprint(task) / PAGE_SIZE_64);
	if (max_footprint) {
		*max_footprint = (uint32_t)(get_task_phys_footprint_max(task) / PAGE_SIZE_64);
	}
	if (max_footprint_lifetime) {
		*max_footprint_lifetime = (uint32_t)(get_task_resident_max(task) / PAGE_SIZE_64);
	}
	if (purgeable_pages) {
		*purgeable_pages = (uint32_t)(get_task_purgeable_size(task) / PAGE_SIZE_64);
	}
}
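
/*
 * Illustrative sketch (hypothetical): the counts above are in pages, so a
 * consumer reporting megabytes would scale by the page size, e.g.:
 *
 *	uint32_t pages = 0;
 *	memorystatus_get_task_page_counts(task, &pages, NULL, NULL, NULL);
 *	uint64_t footprint_mb = ((uint64_t)pages * PAGE_SIZE_64) >> 20;
 */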

static void
memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause)
{
	unsigned int i;

	for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
		if (memorystatus_jetsam_snapshot_list[i].pid == p->p_pid) {
			/* Update if the priority has changed since the snapshot was taken */
			if (memorystatus_jetsam_snapshot_list[i].priority != p->p_memstat_effectivepriority) {
				memorystatus_jetsam_snapshot_list[i].priority = p->p_memstat_effectivepriority;
				strlcpy(memorystatus_jetsam_snapshot_list[i].name, p->p_comm, MAXCOMLEN+1);
				memorystatus_jetsam_snapshot_list[i].state = memorystatus_build_state(p);
				memorystatus_jetsam_snapshot_list[i].user_data = p->p_memstat_userdata;
				memorystatus_jetsam_snapshot_list[i].fds = p->p_fd->fd_nfiles;
			}
			memorystatus_jetsam_snapshot_list[i].killed = kill_cause;
			return;
		}
	}
}

void memorystatus_pages_update(unsigned int pages_avail)
{
	memorystatus_available_pages = pages_avail;

#if VM_PRESSURE_EVENTS
	/*
	 * Since memorystatus_available_pages changes, we should
	 * re-evaluate the pressure levels on the system and
	 * check if we need to wake the pressure thread.
	 * We also update memorystatus_level in that routine.
	 */
	vm_pressure_response();

	if (memorystatus_available_pages <= memorystatus_available_pages_pressure) {

		if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) {
			memorystatus_thread_wake();
		}
	}
#else /* VM_PRESSURE_EVENTS */

	boolean_t critical, delta;

	if (!memorystatus_delta) {
		return;
	}

	critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
	delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
		|| (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;

	if (critical || delta) {
		memorystatus_level = memorystatus_available_pages * 100 / atop_64(max_mem);
		memorystatus_thread_wake();
	}
#endif /* VM_PRESSURE_EVENTS */
}
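
/*
 * Worked example (hypothetical numbers): with 4KB pages and 1GB of DRAM,
 * atop_64(max_mem) == 262144 pages, so 65536 available pages yields
 * memorystatus_level = 65536 * 100 / 262144 = 25, i.e. 25% of memory free.
 */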

static boolean_t
memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry)
{
	clock_sec_t  tv_sec;
	clock_usec_t tv_usec;

	memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));

	entry->pid = p->p_pid;
	strlcpy(&entry->name[0], p->p_comm, MAXCOMLEN+1);
	entry->priority = p->p_memstat_effectivepriority;
	memorystatus_get_task_page_counts(p->task, &entry->pages, &entry->max_pages, &entry->max_pages_lifetime, &entry->purgeable_pages);
	entry->state = memorystatus_build_state(p);
	entry->user_data = p->p_memstat_userdata;
	memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid));
	entry->fds = p->p_fd->fd_nfiles;

	absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
	entry->cpu_time.tv_sec = tv_sec;
	entry->cpu_time.tv_usec = tv_usec;

	return TRUE;
}

static void
memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
{
	kern_return_t kr = KERN_SUCCESS;
	mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
	vm_statistics64_data_t vm_stat;

	if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
		printf("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr);
		memset(&snapshot->stats, 0, sizeof(snapshot->stats));
	} else {
		snapshot->stats.free_pages      = vm_stat.free_count;
		snapshot->stats.active_pages    = vm_stat.active_count;
		snapshot->stats.inactive_pages  = vm_stat.inactive_count;
		snapshot->stats.throttled_pages = vm_stat.throttled_count;
		snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
		snapshot->stats.wired_pages     = vm_stat.wire_count;

		snapshot->stats.speculative_pages = vm_stat.speculative_count;
		snapshot->stats.filebacked_pages  = vm_stat.external_page_count;
		snapshot->stats.anonymous_pages   = vm_stat.internal_page_count;
		snapshot->stats.compressions      = vm_stat.compressions;
		snapshot->stats.decompressions    = vm_stat.decompressions;
		snapshot->stats.compressor_pages  = vm_stat.compressor_page_count;
		snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
	}
}

/*
 * Collect vm statistics at boot.
 * Called only once (see kern_exec.c).
 * Data can be consumed at any time.
 */
void
memorystatus_init_at_boot_snapshot(void) {
	memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
	memorystatus_at_boot_snapshot.entry_count = 0;
	memorystatus_at_boot_snapshot.notification_time = 0;   /* updated when consumed */
	memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
}

static void
memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count)
{
	proc_t p, next_p;
	unsigned int b = 0, i = 0;

	memorystatus_jetsam_snapshot_t *snapshot = NULL;
	memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
	unsigned int snapshot_max = 0;

	if (od_snapshot) {
		/*
		 * This is an on_demand snapshot
		 */
		snapshot      = od_snapshot;
		snapshot_list = od_snapshot->entries;
		snapshot_max  = ods_list_count;
	} else {
		/*
		 * This is a jetsam event snapshot
		 */
		snapshot      = memorystatus_jetsam_snapshot;
		snapshot_list = memorystatus_jetsam_snapshot->entries;
		snapshot_max  = memorystatus_jetsam_snapshot_max;
	}

	memorystatus_init_snapshot_vmstats(snapshot);

	next_p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (next_p) {
		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);

		if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i])) {
			continue;
		}

		MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
			p->p_pid,
			p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7],
			p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]);

		if (++i == snapshot_max) {
			break;
		}
	}

	snapshot->snapshot_time = mach_absolute_time();
	snapshot->entry_count = i;

	if (!od_snapshot) {
		/* update the system buffer count */
		memorystatus_jetsam_snapshot_count = i;
	}
}

#if DEVELOPMENT || DEBUG

static int
memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size) {
	int ret;
	memorystatus_jetsam_panic_options_t debug;

	if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) {
		return EINVAL;
	}

	ret = copyin(buffer, &debug, buffer_size);
	if (ret) {
		return ret;
	}

	/* Panic bits match kMemorystatusKilled* enum */
	memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask);

	/* Copyout new value */
	debug.data = memorystatus_jetsam_panic_debug;
	ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t));

	return ret;
}

/*
 * Triggers a sort_order on a specified jetsam priority band.
 * This is for testing only, used to force a path through the sort
 * function.
 */
static int
memorystatus_cmd_test_jetsam_sort(int priority, int sort_order) {

	int error = 0;

	unsigned int bucket_index = 0;

	if (priority == -1) {
		/* Use as shorthand for default priority */
		bucket_index = JETSAM_PRIORITY_DEFAULT;
	} else {
		bucket_index = (unsigned int)priority;
	}

	error = memorystatus_sort_bucket(bucket_index, sort_order);

	return (error);
}

#endif

/*
 * Jetsam a specific process.
 */
static boolean_t
memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause) {
	boolean_t killed;
	proc_t p;

	/* TODO - add a victim queue and push this into the main jetsam thread */

	p = proc_find(victim_pid);
	if (!p) {
		return FALSE;
	}

	printf("memorystatus: specifically killing pid %d [%s] (%s %d) - memorystatus_available_pages: %d\n",
		victim_pid, (p->p_comm ? p->p_comm : "(unknown)"),
		jetsam_kill_cause_name[cause], p->p_memstat_effectivepriority, memorystatus_available_pages);

	proc_list_lock();

	if (memorystatus_jetsam_snapshot_count == 0) {
		memorystatus_init_jetsam_snapshot_locked(NULL, 0);
	}

	memorystatus_update_jetsam_snapshot_entry_locked(p, cause);
	proc_list_unlock();

	killed = memorystatus_do_kill(p, cause);
	proc_rele(p);

	return killed;
}
3120
3121/*
3122 * Jetsam the first process in the queue.
3123 */
3124static boolean_t
3e170ce0 3125memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, int32_t *priority, uint32_t *errors)
39236c6e
A
3126{
3127 pid_t aPid;
3128 proc_t p = PROC_NULL, next_p = PROC_NULL;
3129 boolean_t new_snapshot = FALSE, killed = FALSE;
3e170ce0 3130 int kill_count = 0;
39236c6e 3131 unsigned int i = 0;
3e170ce0 3132 uint32_t aPid_ep;
b0d623f7 3133
6d2010ae
A
3134#ifndef CONFIG_FREEZE
3135#pragma unused(any)
3136#endif
316670eb 3137
39236c6e
A
3138 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
3139 memorystatus_available_pages, 0, 0, 0, 0);
6d2010ae 3140
316670eb 3141
3e170ce0
A
3142 if (sort_flag == TRUE) {
3143 (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
3144 }
3145
3146 proc_list_lock();
fe8ab488 3147
39236c6e
A
3148 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
3149 while (next_p) {
316670eb
A
3150#if DEVELOPMENT || DEBUG
3151 int activeProcess;
3152 int procSuspendedForDiagnosis;
3153#endif /* DEVELOPMENT || DEBUG */
39236c6e
A
3154
3155 p = next_p;
3156 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
3157
6d2010ae 3158#if DEVELOPMENT || DEBUG
39236c6e
A
3159 activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
3160 procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
6d2010ae 3161#endif /* DEVELOPMENT || DEBUG */
316670eb 3162
39236c6e 3163 aPid = p->p_pid;
3e170ce0 3164 aPid_ep = p->p_memstat_effectivepriority;
316670eb 3165
39236c6e
A
3166 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
3167 continue;
b0d623f7 3168 }
39236c6e 3169
6d2010ae 3170#if DEVELOPMENT || DEBUG
39236c6e
A
3171 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
3172 printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
3173 continue;
3174 }
6d2010ae 3175#endif /* DEVELOPMENT || DEBUG */
316670eb 3176
fe8ab488
A
3177 if (cause == kMemorystatusKilledVnodes)
3178 {
3179 /*
3180 * If the system runs out of vnodes, we systematically jetsam
3181 * processes in hopes of stumbling onto a vnode gain that helps
3182 * the system recover. The process that happens to trigger
3183 * this path has no known relationship to the vnode consumption.
3184 * We attempt to safeguard that process e.g: do not jetsam it.
3185 */
3186
3187 if (p == current_proc()) {
3188 /* do not jetsam the current process */
3189 continue;
3190 }
3191 }
3192
6d2010ae 3193#if CONFIG_FREEZE
39236c6e
A
3194 boolean_t skip;
3195 boolean_t reclaim_proc = !(p->p_memstat_state & (P_MEMSTAT_LOCKED | P_MEMSTAT_NORECLAIM));
3196 if (any || reclaim_proc) {
3197 skip = FALSE;
3198 } else {
3199 skip = TRUE;
3200 }
316670eb 3201
39236c6e
A
3202 if (skip) {
3203 continue;
3204 } else
6d2010ae 3205#endif
39236c6e 3206 {
39236c6e
A
3207 /*
 3208 * Capture a snapshot if none exists and any of the following holds:
 3209 * - idle snapshots are enabled, or priority was not requested (i.e. something other than an ambient kill)
 3210 * - the priority was requested *and* the targeted process is not at idle priority
3211 */
3212 if ((memorystatus_jetsam_snapshot_count == 0) &&
fe8ab488 3213 (memorystatus_idle_snapshot || ((!priority) || (priority && (*priority != JETSAM_PRIORITY_IDLE))))) {
3e170ce0 3214 memorystatus_init_jetsam_snapshot_locked(NULL,0);
39236c6e
A
3215 new_snapshot = TRUE;
3216 }
3217
3218 /*
3219 * Mark as terminated so that if exit1() indicates success, but the process (for example)
3220 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
3221 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
3222 * acquisition of the proc lock.
3223 */
3224 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
3225
6d2010ae 3226#if DEVELOPMENT || DEBUG
39236c6e
A
3227 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && activeProcess) {
3228 MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n",
3229 aPid, (p->p_comm ? p->p_comm: "(unknown)"), memorystatus_level);
3e170ce0 3230 memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic);
39236c6e
A
3231 p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
3232 if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) {
3233 jetsam_diagnostic_suspended_one_active_proc = 1;
3234 printf("jetsam: returning after suspending first active proc - %d\n", aPid);
3235 }
3236
3237 p = proc_ref_locked(p);
3238 proc_list_unlock();
3239 if (p) {
316670eb 3240 task_suspend(p->task);
3e170ce0
A
3241 if (priority) {
3242 *priority = aPid_ep;
3243 }
316670eb 3244 proc_rele(p);
39236c6e
A
3245 killed = TRUE;
3246 }
3247
3248 goto exit;
3249 } else
6d2010ae 3250#endif /* DEVELOPMENT || DEBUG */
39236c6e
A
3251 {
3252 /* Shift queue, update stats */
3e170ce0
A
3253 memorystatus_update_jetsam_snapshot_entry_locked(p, cause);
3254
3255 if (proc_ref_locked(p) == p) {
3256 proc_list_unlock();
3257 printf("memorystatus: %s %d [%s] (%s %d) - memorystatus_available_pages: %d\n",
3258 ((aPid_ep == JETSAM_PRIORITY_IDLE) ?
fe8ab488
A
3259 "idle exiting pid" : "jetsam killing pid"),
3260 aPid, (p->p_comm ? p->p_comm : "(unknown)"),
3e170ce0
A
3261 jetsam_kill_cause_name[cause], aPid_ep, memorystatus_available_pages);
3262
39236c6e 3263 killed = memorystatus_do_kill(p, cause);
3e170ce0
A
3264
3265 /* Success? */
3266 if (killed) {
3267 if (priority) {
3268 *priority = aPid_ep;
3269 }
3270 proc_rele(p);
3271 kill_count++;
3272 goto exit;
3273 }
39236c6e 3274
3e170ce0
A
3275 /*
3276 * Failure - first unwind the state,
3277 * then fall through to restart the search.
3278 */
3279 proc_list_lock();
3280 proc_rele_locked(p);
3281 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
3282 p->p_memstat_state |= P_MEMSTAT_ERROR;
3283 *errors += 1;
6d2010ae 3284 }
39236c6e 3285
3e170ce0
A
3286 /*
3287 * Failure - restart the search.
3288 *
3289 * We might have raced with "p" exiting on another core, resulting in no
3290 * ref on "p". Or, we may have failed to kill "p".
3291 *
3292 * Either way, we fall thru to here, leaving the proc in the
3293 * P_MEMSTAT_TERMINATED state.
3294 *
 3295 * And, we hold the proc_list_lock at this point.
3296 */
3297
39236c6e
A
3298 i = 0;
3299 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6d2010ae 3300 }
b0d623f7 3301 }
b0d623f7 3302 }
316670eb 3303
39236c6e 3304 proc_list_unlock();
316670eb 3305
39236c6e
A
3306exit:
3307 /* Clear snapshot if freshly captured and no target was found */
3308 if (new_snapshot && !killed) {
3309 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
316670eb
A
3310 }
3311
39236c6e 3312 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
3e170ce0 3313 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
b0d623f7 3314
39236c6e 3315 return killed;
316670eb
A
3316}
3317
3e170ce0
A
3318/*
3319 * Jetsam aggressively
3320 */
39236c6e 3321static boolean_t
3e170ce0
A
3322memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr_count, int32_t priority_max,
3323 uint32_t *errors)
d1ecb069 3324{
3e170ce0 3325 pid_t aPid;
39236c6e
A
3326 proc_t p = PROC_NULL, next_p = PROC_NULL;
3327 boolean_t new_snapshot = FALSE, killed = FALSE;
3e170ce0 3328 int kill_count = 0;
39236c6e 3329 unsigned int i = 0;
3e170ce0
A
3330 int32_t aPid_ep = 0;
3331
3332#pragma unused(any)
3333
3334 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
3335 memorystatus_available_pages, priority_max, 0, 0, 0);
3336
39236c6e 3337 proc_list_lock();
3e170ce0 3338
39236c6e
A
3339 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
3340 while (next_p) {
3e170ce0
A
3341#if DEVELOPMENT || DEBUG
3342 int activeProcess;
3343 int procSuspendedForDiagnosis;
3344#endif /* DEVELOPMENT || DEBUG */
39236c6e 3345
3e170ce0
A
3346 if ((unsigned int)(next_p->p_memstat_effectivepriority) != i) {
3347
3348 /*
3349 * We have raced with next_p running on another core, as it has
3350 * moved to a different jetsam priority band. This means we have
3351 * lost our place in line while traversing the jetsam list. We
3352 * attempt to recover by rewinding to the beginning of the band
3353 * we were already traversing. By doing this, we do not guarantee
3354 * that no process escapes this aggressive march, but we can make
3355 * skipping an entire range of processes less likely. (PR-21069019)
3356 */
3357
3358 MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding %s moved from band %d --> %d\n",
3359 aggr_count, next_p->p_comm, i, next_p->p_memstat_effectivepriority);
3360
3361 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
3362 continue;
3363 }
3364
3365 p = next_p;
3366 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
3367
3368 if (p->p_memstat_effectivepriority > priority_max) {
3369 /*
3370 * Bail out of this killing spree if we have
3371 * reached beyond the priority_max jetsam band.
3372 * That is, we kill up to and through the
3373 * priority_max jetsam band.
3374 */
3375 proc_list_unlock();
3376 goto exit;
3377 }
3378
3379#if DEVELOPMENT || DEBUG
3380 activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
3381 procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
3382#endif /* DEVELOPMENT || DEBUG */
3383
3384 aPid = p->p_pid;
3385 aPid_ep = p->p_memstat_effectivepriority;
3386
3387 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
3388 continue;
3389 }
3390
3391#if DEVELOPMENT || DEBUG
3392 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
3393 printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
3394 continue;
3395 }
3396#endif /* DEVELOPMENT || DEBUG */
3397
3398 /*
3399 * Capture a snapshot if none exists.
3400 */
3401 if (memorystatus_jetsam_snapshot_count == 0) {
3402 memorystatus_init_jetsam_snapshot_locked(NULL,0);
3403 new_snapshot = TRUE;
3404 }
3405
3406 /*
3407 * Mark as terminated so that if exit1() indicates success, but the process (for example)
3408 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
3409 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
3410 * acquisition of the proc lock.
3411 */
3412 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
3413
3414 /* Shift queue, update stats */
3415 memorystatus_update_jetsam_snapshot_entry_locked(p, cause);
3416
3417 /*
3418 * In order to kill the target process, we will drop the proc_list_lock.
 3419 * To guarantee that p and next_p don't disappear out from under us,
3420 * we must take a ref on both.
3421 * If we cannot get a reference, then it's likely we've raced with
3422 * that process exiting on another core.
3423 */
3424 if (proc_ref_locked(p) == p) {
3425 if (next_p) {
3426 while (next_p && (proc_ref_locked(next_p) != next_p)) {
3427 proc_t temp_p;
3428
3429 /*
3430 * We must have raced with next_p exiting on another core.
3431 * Recover by getting the next eligible process in the band.
3432 */
3433
3434 MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
3435 aggr_count, next_p->p_pid, (next_p->p_comm ? next_p->p_comm : "(unknown)"));
3436
3437 temp_p = next_p;
3438 next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE);
3439 }
3440 }
3441 proc_list_unlock();
3442
3443 printf("memorystatus: aggressive%d: %s %d [%s] (%s %d) - memorystatus_available_pages: %d\n",
3444 aggr_count,
3445 ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "idle exiting pid" : "jetsam killing pid"),
3446 aPid, (p->p_comm ? p->p_comm : "(unknown)"),
3447 jetsam_kill_cause_name[cause], aPid_ep, memorystatus_available_pages);
3448
3449 killed = memorystatus_do_kill(p, cause);
3450
3451 /* Success? */
3452 if (killed) {
3453 proc_rele(p);
3454 kill_count++;
3455 p = NULL;
3456 killed = FALSE;
3457
3458 /*
3459 * Continue the killing spree.
3460 */
3461 proc_list_lock();
3462 if (next_p) {
3463 proc_rele_locked(next_p);
3464 }
3465 continue;
3466 }
3467
3468 /*
3469 * Failure - first unwind the state,
3470 * then fall through to restart the search.
3471 */
3472 proc_list_lock();
3473 proc_rele_locked(p);
3474 if (next_p) {
3475 proc_rele_locked(next_p);
3476 }
3477 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
3478 p->p_memstat_state |= P_MEMSTAT_ERROR;
3479 *errors += 1;
3480 p = NULL;
3481 }
3482
3483 /*
3484 * Failure - restart the search at the beginning of
3485 * the band we were already traversing.
3486 *
3487 * We might have raced with "p" exiting on another core, resulting in no
3488 * ref on "p". Or, we may have failed to kill "p".
3489 *
3490 * Either way, we fall thru to here, leaving the proc in the
3491 * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
3492 *
 3493 * And, we hold the proc_list_lock at this point.
3494 */
3495
3496 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
3497 }
3498
3499 proc_list_unlock();
3500
3501exit:
3502 /* Clear snapshot if freshly captured and no target was found */
3503 if (new_snapshot && (kill_count == 0)) {
3504 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
3505 }
3506
3507 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
3508 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
3509
3510 if (kill_count > 0) {
3511 return(TRUE);
3512 }
3513 else {
3514 return(FALSE);
3515 }
3516}
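/*
 * A hedged sketch of how a caller might drive the aggressive path above,
 * escalating aggr_count across passes. The trigger policy and loop are
 * invented for illustration; only the function signature, the priority
 * constant and memorystatus_clear_errors() come from this file.
 */
#if 0 /* illustrative only */
static void
memorystatus_aggressive_sweep_example(void)
{
	uint32_t errors = 0;
	int aggr_count = 0;

	/* Kill up to and through the foreground-support band while critical. */
	while ((memorystatus_available_pages < memorystatus_available_pages_critical) &&
	    (aggr_count < 3)) {
		aggr_count++;
		if (!memorystatus_kill_top_process_aggressive(TRUE,
		    kMemorystatusKilledVMThrashing, aggr_count,
		    JETSAM_PRIORITY_FOREGROUND_SUPPORT, &errors)) {
			break;	/* nothing left in the eligible bands */
		}
	}
	if (errors) {
		memorystatus_clear_errors();
	}
}
#endif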
3517
3518#if LEGACY_HIWATER
3519
3520static boolean_t
3521memorystatus_kill_hiwat_proc(uint32_t *errors)
3522{
3523 pid_t aPid = 0;
3524 proc_t p = PROC_NULL, next_p = PROC_NULL;
3525 boolean_t new_snapshot = FALSE, killed = FALSE;
3526 int kill_count = 0;
3527 unsigned int i = 0;
3528 uint32_t aPid_ep;
3529
3530 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
3531 memorystatus_available_pages, 0, 0, 0, 0);
3532
3533 proc_list_lock();
3534
3535 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
3536 while (next_p) {
3537 uint32_t footprint;
3538 boolean_t skip;
3539
3540 p = next_p;
3541 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
3542
39236c6e 3543 aPid = p->p_pid;
3e170ce0 3544 aPid_ep = p->p_memstat_effectivepriority;
316670eb 3545
39236c6e
A
3546 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
3547 continue;
3548 }
3549
3550 /* skip if no limit set */
3551 if (p->p_memstat_memlimit <= 0) {
3552 continue;
d1ecb069 3553 }
3e170ce0
A
3554
3555#if 0
3556 /*
3557 * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore.
3558 * Background limits are described via the inactive limit slots.
3559 * Their fatal/non-fatal setting will drive whether or not to be
3560 * considered in this kill path.
3561 */
3562
39236c6e
A
3563 /* skip if a currently inapplicable limit is encountered */
3564 if ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
3565 continue;
3566 }
3e170ce0 3567#endif
39236c6e
A
3568
3569 footprint = (uint32_t)(get_task_phys_footprint(p->task) / (1024 * 1024));
3570 skip = (((int32_t)footprint) <= p->p_memstat_memlimit);
3e170ce0 3571
6d2010ae 3572#if DEVELOPMENT || DEBUG
39236c6e
A
3573 if (!skip && (memorystatus_jetsam_policy & kPolicyDiagnoseActive)) {
3574 if (p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED) {
3575 continue;
6d2010ae 3576 }
39236c6e 3577 }
6d2010ae 3578#endif /* DEVELOPMENT || DEBUG */
316670eb 3579
6d2010ae 3580#if CONFIG_FREEZE
39236c6e
A
3581 if (!skip) {
3582 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
3583 skip = TRUE;
3584 } else {
3585 skip = FALSE;
3586 }
3587 }
6d2010ae 3588#endif
316670eb 3589
39236c6e
A
3590 if (skip) {
3591 continue;
3592 } else {
fe8ab488
A
 3593 MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %d Mb > limit (%d Mb)\n",
3594 (memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing", aPid, p->p_comm, footprint, p->p_memstat_memlimit);
39236c6e
A
3595
3596 if (memorystatus_jetsam_snapshot_count == 0) {
3e170ce0 3597 memorystatus_init_jetsam_snapshot_locked(NULL,0);
39236c6e
A
3598 new_snapshot = TRUE;
3599 }
3600
3601 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
3602
6d2010ae 3603#if DEVELOPMENT || DEBUG
39236c6e
A
3604 if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
3605 MEMORYSTATUS_DEBUG(1, "jetsam: pid %d suspended for diagnosis - memorystatus_available_pages: %d\n", aPid, memorystatus_available_pages);
3e170ce0 3606 memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic);
39236c6e
A
3607 p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
3608
3609 p = proc_ref_locked(p);
3610 proc_list_unlock();
3611 if (p) {
6d2010ae
A
3612 task_suspend(p->task);
3613 proc_rele(p);
39236c6e
A
3614 killed = TRUE;
3615 }
3616
3617 goto exit;
3618 } else
6d2010ae 3619#endif /* DEVELOPMENT || DEBUG */
39236c6e 3620 {
3e170ce0 3621 memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledHiwat);
39236c6e 3622
3e170ce0
A
3623 if (proc_ref_locked(p) == p) {
3624 proc_list_unlock();
3625
3626 printf("memorystatus: jetsam killing pid %d [%s] (highwater %d) - memorystatus_available_pages: %d\n",
3627 aPid, (p->p_comm ? p->p_comm : "(unknown)"), aPid_ep, memorystatus_available_pages);
3628
3629 killed = memorystatus_do_kill(p, kMemorystatusKilledHiwat);
39236c6e 3630
3e170ce0
A
3631 /* Success? */
3632 if (killed) {
3633 proc_rele(p);
3634 kill_count++;
3635 goto exit;
3636 }
3637
3638 /*
3639 * Failure - first unwind the state,
3640 * then fall through to restart the search.
3641 */
3642 proc_list_lock();
3643 proc_rele_locked(p);
3644 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
3645 p->p_memstat_state |= P_MEMSTAT_ERROR;
3646 *errors += 1;
6d2010ae 3647 }
6d2010ae 3648
3e170ce0
A
3649 /*
3650 * Failure - restart the search.
3651 *
3652 * We might have raced with "p" exiting on another core, resulting in no
3653 * ref on "p". Or, we may have failed to kill "p".
3654 *
3655 * Either way, we fall thru to here, leaving the proc in the
3656 * P_MEMSTAT_TERMINATED state.
3657 *
 3658 * And, we hold the proc_list_lock at this point.
3659 */
3660
39236c6e
A
3661 i = 0;
3662 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
3663 }
6d2010ae
A
3664 }
3665 }
316670eb 3666
39236c6e 3667 proc_list_unlock();
316670eb 3668
39236c6e
A
3669exit:
3670 /* Clear snapshot if freshly captured and no target was found */
3671 if (new_snapshot && !killed) {
3672 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
316670eb
A
3673 }
3674
39236c6e 3675 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
3e170ce0 3676 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
6d2010ae 3677
39236c6e 3678 return killed;
316670eb 3679}
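/*
 * A condensed restatement of the highwater test used above, as a
 * hypothetical helper. get_task_phys_footprint() returns bytes while
 * p_memstat_memlimit is cached in MB, hence the conversion; a non-positive
 * limit means no limit is set and the proc is never highwater-killed.
 */
#if 0 /* illustrative only */
static boolean_t
proc_over_highwater_example(proc_t p)
{
	uint32_t footprint_mb;

	if (p->p_memstat_memlimit <= 0) {
		return FALSE;	/* no limit configured */
	}
	footprint_mb = (uint32_t)(get_task_phys_footprint(p->task) / (1024 * 1024));
	return ((int32_t)footprint_mb > p->p_memstat_memlimit);
}
#endif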
2d21ac55 3680
39236c6e 3681#endif /* LEGACY_HIWATER */
316670eb 3682
39236c6e
A
3683static boolean_t
3684memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) {
3685 /* TODO: allow a general async path */
fe8ab488
A
3686 if ((victim_pid != -1) || (cause != kMemorystatusKilledVMPageShortage && cause != kMemorystatusKilledVMThrashing &&
3687 cause != kMemorystatusKilledFCThrashing)) {
39236c6e 3688 return FALSE;
316670eb 3689 }
39236c6e 3690
fe8ab488 3691 kill_under_pressure_cause = cause;
39236c6e
A
3692 memorystatus_thread_wake();
3693 return TRUE;
3694}
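/*
 * The async path above only latches the cause and wakes the jetsam thread.
 * A sketch of the consumer side of that handshake (the real consumer is
 * memorystatus_thread(), earlier in this file; this fragment is
 * illustrative, not the actual thread body):
 */
#if 0 /* illustrative only */
	uint32_t cause = kill_under_pressure_cause;

	if (cause != 0) {
		int32_t priority = 0;
		uint32_t errors = 0;

		/* One kill per wakeup; clear the latch once handled. */
		if (memorystatus_kill_top_process(TRUE, FALSE, cause, &priority, &errors)) {
			kill_under_pressure_cause = 0;
		}
	}
#endif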
2d21ac55 3695
39236c6e
A
3696static boolean_t
3697memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause) {
3698 boolean_t res;
3699 uint32_t errors = 0;
3700
3701 if (victim_pid == -1) {
3702 /* No pid, so kill first process */
3e170ce0 3703 res = memorystatus_kill_top_process(TRUE, TRUE, cause, NULL, &errors);
39236c6e
A
3704 } else {
3705 res = memorystatus_kill_specific_process(victim_pid, cause);
3706 }
3707
3708 if (errors) {
3709 memorystatus_clear_errors();
3710 }
3711
3712 if (res == TRUE) {
3713 /* Fire off snapshot notification */
3714 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
3715 sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
3e170ce0
A
3716 uint64_t timestamp_now = mach_absolute_time();
3717 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
3718 if (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
3719 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout) {
3720 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
3721 if (!ret) {
3722 proc_list_lock();
3723 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
3724 proc_list_unlock();
3725 }
3726 }
39236c6e
A
3727 }
3728
3729 return res;
3730}
b0d623f7 3731
39236c6e
A
3732boolean_t
3733memorystatus_kill_on_VM_page_shortage(boolean_t async) {
3734 if (async) {
3735 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
3736 } else {
3737 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage);
3738 }
3739}
2d21ac55 3740
39236c6e
A
3741boolean_t
3742memorystatus_kill_on_VM_thrashing(boolean_t async) {
3743 if (async) {
3744 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMThrashing);
3745 } else {
3746 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing);
2d21ac55
A
3747 }
3748}
b0d623f7 3749
fe8ab488
A
3750boolean_t
3751memorystatus_kill_on_FC_thrashing(boolean_t async) {
3752 if (async) {
3753 return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing);
3754 } else {
3755 return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing);
3756 }
3757}
3758
39236c6e
A
3759boolean_t
3760memorystatus_kill_on_vnode_limit(void) {
3761 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes);
3762}
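/*
 * Example call sites, roughly as the VM layer might use these entry points;
 * the surrounding page-shortage and thrash detection logic is assumed, not
 * shown.
 */
#if 0 /* illustrative only */
	/* From a pageout path: kick the jetsam thread without blocking. */
	(void)memorystatus_kill_on_VM_page_shortage(TRUE);	/* async */

	/* From a context that can block and needs a kill before returning. */
	(void)memorystatus_kill_on_VM_thrashing(FALSE);		/* sync */

	/* Vnode exhaustion only has a synchronous form. */
	(void)memorystatus_kill_on_vnode_limit();
#endif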
3763
316670eb
A
3764#endif /* CONFIG_JETSAM */
3765
6d2010ae
A
3766#if CONFIG_FREEZE
3767
3768__private_extern__ void
316670eb 3769memorystatus_freeze_init(void)
6d2010ae 3770{
316670eb
A
3771 kern_return_t result;
3772 thread_t thread;
3e170ce0
A
3773
3774 freezer_lck_grp_attr = lck_grp_attr_alloc_init();
3775 freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr);
3776
3777 lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL);
39236c6e 3778
316670eb
A
3779 result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread);
3780 if (result == KERN_SUCCESS) {
3781 thread_deallocate(thread);
3782 } else {
3783 panic("Could not create memorystatus_freeze_thread");
3784 }
6d2010ae
A
3785}
3786
3e170ce0
A
3787/*
3788 * Synchronously freeze the passed proc. Called with a reference to the proc held.
3789 *
3790 * Returns EINVAL or the value returned by task_freeze().
3791 */
3792int
3793memorystatus_freeze_process_sync(proc_t p)
3794{
3795 int ret = EINVAL;
3796 pid_t aPid = 0;
3797 boolean_t memorystatus_freeze_swap_low = FALSE;
3798
3799 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
3800 memorystatus_available_pages, 0, 0, 0, 0);
3801
3802 lck_mtx_lock(&freezer_mutex);
3803
3804 if (p == NULL) {
3805 goto exit;
3806 }
3807
3808 if (memorystatus_freeze_enabled == FALSE) {
3809 goto exit;
3810 }
3811
3812 if (!memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
3813 goto exit;
3814 }
3815
3816 if (memorystatus_freeze_update_throttle()) {
 3817 printf("memorystatus_freeze_process_sync: in throttle, ignoring freeze\n");
3818 memorystatus_freeze_throttle_count++;
3819 goto exit;
3820 }
3821
3822 proc_list_lock();
3823
3824 if (p != NULL) {
3825 uint32_t purgeable, wired, clean, dirty, state;
3826 uint32_t max_pages, pages, i;
3827 boolean_t shared;
3828
3829 aPid = p->p_pid;
3830 state = p->p_memstat_state;
3831
3832 /* Ensure the process is eligible for freezing */
3833 if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
3834 proc_list_unlock();
3835 goto exit;
3836 }
3837
3838 /* Only freeze processes meeting our minimum resident page criteria */
3839 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
3840 if (pages < memorystatus_freeze_pages_min) {
3841 proc_list_unlock();
3842 goto exit;
3843 }
3844
3845 if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
3846
3847 unsigned int avail_swap_space = 0; /* in pages. */
3848
3849 if (DEFAULT_FREEZER_IS_ACTIVE) {
3850 /*
3851 * Freezer backed by default pager and swap file(s).
3852 */
3853 avail_swap_space = default_pager_swap_pages_free();
3854 } else {
3855 /*
3856 * Freezer backed by the compressor and swap file(s)
3857 * while will hold compressed data.
3858 */
3859 avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
3860 }
3861
3862 max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);
3863
3864 if (max_pages < memorystatus_freeze_pages_min) {
3865 proc_list_unlock();
3866 goto exit;
3867 }
3868 } else {
3869 /*
3870 * We only have the compressor without any swap.
3871 */
3872 max_pages = UINT32_MAX - 1;
3873 }
3874
3875 /* Mark as locked temporarily to avoid kill */
3876 p->p_memstat_state |= P_MEMSTAT_LOCKED;
3877 proc_list_unlock();
3878
3879 ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
3880
3881 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_process_sync: task_freeze %s for pid %d [%s] - "
3882 "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n",
3883 (ret == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (p->p_comm ? p->p_comm : "(unknown)"),
3884 memorystatus_available_pages, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free());
3885
3886 proc_list_lock();
3887 p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
3888
3889 if (ret == KERN_SUCCESS) {
3890 memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
3891
3892 memorystatus_frozen_count++;
3893
3894 p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
3895
3896 if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
3897 /* Update stats */
3898 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
3899 throttle_intervals[i].pageouts += dirty;
3900 }
3901 }
3902
3903 memorystatus_freeze_pageouts += dirty;
3904 memorystatus_freeze_count++;
3905
3906 proc_list_unlock();
3907
3908 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
3909 } else {
3910 proc_list_unlock();
3911 }
3912 }
3913
3914exit:
3915 lck_mtx_unlock(&freezer_mutex);
3916 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
3917 memorystatus_available_pages, aPid, 0, 0, 0);
3918
3919 return ret;
3920}
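/*
 * Worked example of the max_pages cap computed above, with invented numbers
 * and 4K pages assumed: if the swap-backed freezer reports 131072 free pages
 * (512 MB) and memorystatus_freeze_pages_max is 32768 (128 MB), then
 * MIN(131072, 32768) == 32768 pages may be frozen. If free swap fell below
 * memorystatus_freeze_pages_min, the freeze would be abandoned instead.
 */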
3921
316670eb 3922static int
39236c6e 3923memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low)
6d2010ae 3924{
39236c6e
A
3925 pid_t aPid = 0;
3926 int ret = -1;
3927 proc_t p = PROC_NULL, next_p = PROC_NULL;
3928 unsigned int i = 0;
6d2010ae 3929
39236c6e
A
3930 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
3931 memorystatus_available_pages, 0, 0, 0, 0);
3932
3933 proc_list_lock();
6d2010ae 3934
39236c6e
A
3935 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
3936 while (next_p) {
3937 kern_return_t kr;
3938 uint32_t purgeable, wired, clean, dirty;
3939 boolean_t shared;
3940 uint32_t pages;
3941 uint32_t max_pages = 0;
316670eb
A
3942 uint32_t state;
3943
39236c6e
A
3944 p = next_p;
3945 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6d2010ae 3946
39236c6e
A
3947 aPid = p->p_pid;
3948 state = p->p_memstat_state;
6d2010ae 3949
316670eb 3950 /* Ensure the process is eligible for freezing */
39236c6e 3951 if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
316670eb
A
3952 continue; // with lock held
3953 }
316670eb 3954
39236c6e 3955 /* Only freeze processes meeting our minimum resident page criteria */
fe8ab488 3956 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
39236c6e
A
3957 if (pages < memorystatus_freeze_pages_min) {
3958 continue; // with lock held
3959 }
6d2010ae 3960
fe8ab488 3961 if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
3e170ce0
A
3962
3963 /* Ensure there's enough free space to freeze this process. */
3964
3965 unsigned int avail_swap_space = 0; /* in pages. */
3966
3967 if (DEFAULT_FREEZER_IS_ACTIVE) {
3968 /*
3969 * Freezer backed by default pager and swap file(s).
3970 */
3971 avail_swap_space = default_pager_swap_pages_free();
3972 } else {
3973 /*
3974 * Freezer backed by the compressor and swap file(s)
 3975 * which will hold compressed data.
3976 */
3977 avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
3978 }
3979
3980 max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);
3981
316670eb
A
3982 if (max_pages < memorystatus_freeze_pages_min) {
3983 *memorystatus_freeze_swap_low = TRUE;
39236c6e
A
3984 proc_list_unlock();
3985 goto exit;
316670eb 3986 }
39236c6e 3987 } else {
3e170ce0
A
3988 /*
3989 * We only have the compressor pool.
3990 */
39236c6e
A
3991 max_pages = UINT32_MAX - 1;
3992 }
3993
3994 /* Mark as locked temporarily to avoid kill */
3995 p->p_memstat_state |= P_MEMSTAT_LOCKED;
3996
3997 p = proc_ref_locked(p);
3998 proc_list_unlock();
3999 if (!p) {
4000 goto exit;
4001 }
4002
4003 kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
4004
4005 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - "
4006 "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n",
4007 (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (p->p_comm ? p->p_comm : "(unknown)"),
4008 memorystatus_available_pages, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free());
4009
4010 proc_list_lock();
4011 p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
4012
4013 /* Success? */
4014 if (KERN_SUCCESS == kr) {
4015 memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
316670eb 4016
39236c6e 4017 memorystatus_frozen_count++;
316670eb 4018
39236c6e
A
4019 p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
4020
3e170ce0
A
4021 if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
4022 /* Update stats */
4023 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
4024 throttle_intervals[i].pageouts += dirty;
4025 }
39236c6e 4026 }
3e170ce0 4027
39236c6e
A
4028 memorystatus_freeze_pageouts += dirty;
4029 memorystatus_freeze_count++;
4030
4031 proc_list_unlock();
6d2010ae 4032
39236c6e 4033 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
6d2010ae 4034
3e170ce0
A
 4035 /* Return KERN_SUCCESS */
4036 ret = kr;
6d2010ae 4037
39236c6e
A
4038 } else {
4039 proc_list_unlock();
316670eb 4040 }
39236c6e
A
4041
4042 proc_rele(p);
4043 goto exit;
6d2010ae 4044 }
316670eb 4045
39236c6e
A
4046 proc_list_unlock();
4047
4048exit:
4049 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
4050 memorystatus_available_pages, aPid, 0, 0, 0);
316670eb 4051
39236c6e 4052 return ret;
6d2010ae
A
4053}
4054
316670eb
A
4055static inline boolean_t
4056memorystatus_can_freeze_processes(void)
6d2010ae 4057{
316670eb 4058 boolean_t ret;
6d2010ae 4059
39236c6e 4060 proc_list_lock();
316670eb
A
4061
4062 if (memorystatus_suspended_count) {
4063 uint32_t average_resident_pages, estimated_processes;
4064
4065 /* Estimate the number of suspended processes we can fit */
39236c6e 4066 average_resident_pages = memorystatus_suspended_footprint_total / memorystatus_suspended_count;
316670eb
A
4067 estimated_processes = memorystatus_suspended_count +
4068 ((memorystatus_available_pages - memorystatus_available_pages_critical) / average_resident_pages);
4069
4070 /* If it's predicted that no freeze will occur, lower the threshold temporarily */
4071 if (estimated_processes <= FREEZE_SUSPENDED_THRESHOLD_DEFAULT) {
4072 memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_LOW;
6d2010ae 4073 } else {
39236c6e 4074 memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
6d2010ae 4075 }
6d2010ae 4076
316670eb
A
4077 MEMORYSTATUS_DEBUG(1, "memorystatus_can_freeze_processes: %d suspended processes, %d average resident pages / process, %d suspended processes estimated\n",
4078 memorystatus_suspended_count, average_resident_pages, estimated_processes);
6d2010ae 4079
316670eb
A
4080 if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) {
4081 ret = TRUE;
4082 } else {
4083 ret = FALSE;
6d2010ae 4084 }
316670eb
A
4085 } else {
4086 ret = FALSE;
6d2010ae 4087 }
316670eb 4088
39236c6e 4089 proc_list_unlock();
6d2010ae 4090
316670eb 4091 return ret;
6d2010ae
A
4092}
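/*
 * Worked example of the estimate above, using invented numbers: 10 suspended
 * processes with 50000 resident pages total gives an average of 5000 pages
 * per process; if 100000 pages sit between the current and critical levels,
 * about 20 more suspended processes fit, so estimated_processes == 30.
 * Whether that lowers the threshold depends on how it compares to
 * FREEZE_SUSPENDED_THRESHOLD_DEFAULT.
 */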
4093
316670eb
A
4094static boolean_t
4095memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
6d2010ae 4096{
3e170ce0
A
4097 boolean_t can_freeze = TRUE;
4098
316670eb
A
4099 /* Only freeze if we're sufficiently low on memory; this holds off freeze right
 4100 after boot, and is generally a no-op once we've reached steady state. */
4101 if (memorystatus_available_pages > memorystatus_freeze_threshold) {
4102 return FALSE;
4103 }
4104
4105 /* Check minimum suspended process threshold. */
4106 if (!memorystatus_can_freeze_processes()) {
4107 return FALSE;
4108 }
6d2010ae 4109
3e170ce0
A
4110 if (COMPRESSED_PAGER_IS_SWAPLESS || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS) {
4111 /*
4112 * In-core compressor used for freezing WITHOUT on-disk swap support.
4113 */
4114
4115 if (vm_compressor_low_on_space()) {
 4116 if (memorystatus_freeze_swap_low) { /* out-param may be NULL */
4117 *memorystatus_freeze_swap_low = TRUE;
4118 }
4119
4120 can_freeze = FALSE;
4121
4122 } else {
 4123 if (memorystatus_freeze_swap_low) {
4124 *memorystatus_freeze_swap_low = FALSE;
4125 }
4126
4127 can_freeze = TRUE;
4128 }
4129 } else {
4130 /*
4131 * Freezing WITH on-disk swap support.
4132 */
4133
4134 if (DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
4135 /*
4136 * In-core compressor fronts the swap.
4137 */
4138 if (vm_swap_low_on_space()) {
 4139 if (memorystatus_freeze_swap_low) {
4140 *memorystatus_freeze_swap_low = TRUE;
4141 }
4142
4143 can_freeze = FALSE;
4144 }
4145
4146 } else if (DEFAULT_FREEZER_IS_ACTIVE) {
4147 /*
4148 * Legacy freeze mode with no compressor support.
4149 */
4150 if (default_pager_swap_pages_free() < memorystatus_freeze_pages_min) {
 4151 if (memorystatus_freeze_swap_low) {
4152 *memorystatus_freeze_swap_low = TRUE;
4153 }
4154
4155 can_freeze = FALSE;
4156 }
4157 } else {
4158 panic("Not a valid freeze configuration.\n");
316670eb 4159 }
6d2010ae
A
4160 }
4161
3e170ce0 4162 return can_freeze;
6d2010ae
A
4163}
4164
4165static void
316670eb 4166memorystatus_freeze_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
6d2010ae 4167{
3e170ce0 4168 unsigned int freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE);
6d2010ae
A
4169 if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) {
4170 if (!interval->max_pageouts) {
3e170ce0 4171 interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / (24 * 60)));
6d2010ae 4172 } else {
316670eb 4173 printf("memorystatus_freeze_update_throttle_interval: %d minute throttle timeout, resetting\n", interval->mins);
6d2010ae
A
4174 }
4175 interval->ts.tv_sec = interval->mins * 60;
4176 interval->ts.tv_nsec = 0;
4177 ADD_MACH_TIMESPEC(&interval->ts, ts);
316670eb 4178 /* Since we update the throttle stats pre-freeze, adjust for overshoot here */
6d2010ae
A
4179 if (interval->pageouts > interval->max_pageouts) {
4180 interval->pageouts -= interval->max_pageouts;
4181 } else {
4182 interval->pageouts = 0;
4183 }
4184 interval->throttle = FALSE;
4185 } else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) {
316670eb 4186 printf("memorystatus_freeze_update_throttle_interval: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
6d2010ae
A
4187 interval->throttle = TRUE;
4188 }
316670eb
A
4189
4190 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
6d2010ae
A
4191 interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60,
4192 interval->throttle ? "on" : "off");
6d2010ae
A
4193}
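/*
 * Worked example of the budget math above, with assumed values (4K pages):
 * memorystatus_freeze_daily_mb_max == 1024 gives freeze_daily_pageouts_max
 * == 1024 * (1048576 / 4096) == 262144 pages/day. A 60-minute interval with
 * burst_multiple == 2 then gets max_pageouts == 2 * ((60 * 262144) / 1440)
 * == 21844 pages before memorystatus_freeze_update_throttle_interval()
 * flips interval->throttle on.
 */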
4194
4195static boolean_t
316670eb 4196memorystatus_freeze_update_throttle(void)
6d2010ae
A
4197{
4198 clock_sec_t sec;
4199 clock_nsec_t nsec;
4200 mach_timespec_t ts;
4201 uint32_t i;
4202 boolean_t throttled = FALSE;
4203
4204#if DEVELOPMENT || DEBUG
316670eb 4205 if (!memorystatus_freeze_throttle_enabled)
6d2010ae
A
4206 return FALSE;
4207#endif
4208
4209 clock_get_system_nanotime(&sec, &nsec);
4210 ts.tv_sec = sec;
4211 ts.tv_nsec = nsec;
4212
316670eb 4213 /* Check freeze pageouts over multiple intervals and throttle if we've exceeded our budget.
6d2010ae 4214 *
316670eb 4215 * This ensures that periods of inactivity can't be used as 'credit' towards freeze if the device has
6d2010ae
A
4216 * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in
4217 * order to allow for bursts of activity.
4218 */
4219 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
316670eb 4220 memorystatus_freeze_update_throttle_interval(&ts, &throttle_intervals[i]);
6d2010ae
A
4221 if (throttle_intervals[i].throttle == TRUE)
4222 throttled = TRUE;
4223 }
4224
4225 return throttled;
4226}
4227
4228static void
316670eb 4229memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
6d2010ae 4230{
316670eb 4231 static boolean_t memorystatus_freeze_swap_low = FALSE;
3e170ce0
A
4232
4233 lck_mtx_lock(&freezer_mutex);
316670eb
A
4234 if (memorystatus_freeze_enabled) {
4235 if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
3e170ce0
A
 4236 /* Only freeze if we've not exceeded our pageout budgets. */
4237 if (!memorystatus_freeze_update_throttle()) {
39236c6e 4238 memorystatus_freeze_top_process(&memorystatus_freeze_swap_low);
316670eb
A
4239 } else {
4240 printf("memorystatus_freeze_thread: in throttle, ignoring freeze\n");
4241 memorystatus_freeze_throttle_count++; /* Throttled, update stats */
4242 }
4243 }
4244 }
3e170ce0 4245 lck_mtx_unlock(&freezer_mutex);
6d2010ae 4246
316670eb
A
4247 assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
4248 thread_block((thread_continue_t) memorystatus_freeze_thread);
4249}
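/*
 * The freeze thread parks on &memorystatus_freeze_wakeup via assert_wait()
 * and resumes through the thread_block() continuation above. The waker side
 * is presumably a one-liner wherever a proc becomes freeze-eligible
 * (e.g. on suspension); shown here only to make the handshake explicit:
 */
#if 0 /* illustrative only */
	thread_wakeup((event_t)&memorystatus_freeze_wakeup);
#endif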
4250
4251#endif /* CONFIG_FREEZE */
6d2010ae 4252
fe8ab488 4253#if VM_PRESSURE_EVENTS
6d2010ae 4254
fe8ab488 4255#if CONFIG_MEMORYSTATUS
316670eb 4256
fe8ab488
A
4257static int
4258memorystatus_send_note(int event_code, void *data, size_t data_length) {
4259 int ret;
4260 struct kev_msg ev_msg;
316670eb 4261
fe8ab488
A
4262 ev_msg.vendor_code = KEV_VENDOR_APPLE;
4263 ev_msg.kev_class = KEV_SYSTEM_CLASS;
4264 ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;
4265
4266 ev_msg.event_code = event_code;
4267
4268 ev_msg.dv[0].data_length = data_length;
4269 ev_msg.dv[0].data_ptr = data;
4270 ev_msg.dv[1].data_length = 0;
4271
4272 ret = kev_post_msg(&ev_msg);
4273 if (ret) {
4274 printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
316670eb 4275 }
39236c6e 4276
fe8ab488 4277 return ret;
316670eb
A
4278}
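/*
 * A hedged user-space sketch of receiving these notes. Events posted with
 * kev_post_msg() are read from a PF_SYSTEM/SYSPROTO_EVENT socket filtered
 * with SIOCSKEVFILT. The structures come from <sys/kern_event.h>; the header
 * set, loop structure and error handling are assumptions for illustration.
 */
#if 0 /* illustrative only - user-space code */
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/kern_event.h>
#include <sys/kern_memorystatus.h>	/* KEV_MEMORYSTATUS_SUBCLASS, if exported */
#include <stdio.h>
#include <unistd.h>

static void
watch_memorystatus_events(void)
{
	char buf[1024];
	struct kev_request req = {
		.vendor_code  = KEV_VENDOR_APPLE,
		.kev_class    = KEV_SYSTEM_CLASS,
		.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS,
	};
	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);

	if (fd < 0 || ioctl(fd, SIOCSKEVFILT, &req) < 0) {
		return;
	}
	while (read(fd, buf, sizeof(buf)) > 0) {
		struct kern_event_msg *ev = (struct kern_event_msg *)buf;
		printf("memorystatus event code %u\n", ev->event_code);
	}
	close(fd);
}
#endif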
4279
fe8ab488
A
4280boolean_t
4281memorystatus_warn_process(pid_t pid, boolean_t critical) {
316670eb 4282
fe8ab488 4283 boolean_t ret = FALSE;
3e170ce0 4284 boolean_t found_knote = FALSE;
fe8ab488 4285 struct knote *kn = NULL;
316670eb 4286
fe8ab488
A
4287 /*
4288 * See comment in sysctl_memorystatus_vm_pressure_send.
4289 */
39236c6e 4290
fe8ab488 4291 memorystatus_klist_lock();
3e170ce0
A
4292
4293 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
4294 proc_t knote_proc = kn->kn_kq->kq_p;
4295 pid_t knote_pid = knote_proc->p_pid;
4296
4297 if (knote_pid == pid) {
4298 /*
4299 * By setting the "fflags" here, we are forcing
4300 * a process to deal with the case where it's
4301 * bumping up into its memory limits. If we don't
4302 * do this here, we will end up depending on the
4303 * system pressure snapshot evaluation in
4304 * filt_memorystatus().
4305 */
39236c6e 4306
3e170ce0
A
4307 if (critical) {
4308 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
4309 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
4310 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
4311 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
4312 }
4313 } else {
4314 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
4315 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
4316 }
4317 }
4318
4319 found_knote = TRUE;
39236c6e 4320 }
3e170ce0
A
4321 }
4322
4323 if (found_knote) {
4324 KNOTE(&memorystatus_klist, 0);
4325 ret = TRUE;
fe8ab488
A
4326 } else {
4327 if (vm_dispatch_pressure_note_to_pid(pid, FALSE) == 0) {
4328 ret = TRUE;
6d2010ae
A
4329 }
4330 }
3e170ce0 4331
fe8ab488 4332 memorystatus_klist_unlock();
6d2010ae 4333
fe8ab488 4334 return ret;
316670eb
A
4335}
4336
3e170ce0
A
4337/*
4338 * Can only be set by the current task on itself.
4339 */
4340int
4341memorystatus_low_mem_privileged_listener(uint32_t op_flags)
4342{
4343 boolean_t set_privilege = FALSE;
4344 /*
4345 * Need an entitlement check here?
4346 */
4347 if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
4348 set_privilege = TRUE;
4349 } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
4350 set_privilege = FALSE;
4351 } else {
4352 return EINVAL;
4353 }
4354
4355 return (task_low_mem_privileged_listener(current_task(), set_privilege, NULL));
4356}
4357
39236c6e 4358int
316670eb 4359memorystatus_send_pressure_note(pid_t pid) {
39236c6e
A
4360 MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
4361 return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
6d2010ae
A
4362}
4363
fe8ab488
A
4364void
4365memorystatus_send_low_swap_note(void) {
4366
4367 struct knote *kn = NULL;
3e170ce0 4368
fe8ab488
A
4369 memorystatus_klist_lock();
4370 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
3e170ce0
A
4371 /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
4372 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
4373 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
4374 * kMemoryStatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
fe8ab488 4375 if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
3e170ce0
A
4376 KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
4377 break;
fe8ab488
A
4378 }
4379 }
3e170ce0 4380
fe8ab488
A
4381 memorystatus_klist_unlock();
4382}
4383
39236c6e
A
4384boolean_t
4385memorystatus_bg_pressure_eligible(proc_t p) {
4386 boolean_t eligible = FALSE;
4387
4388 proc_list_lock();
4389
4390 MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);
4391
4392 /* Foreground processes have already been dealt with at this point, so just test for eligibility */
4393 if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
4394 eligible = TRUE;
4395 }
4396
4397 proc_list_unlock();
4398
4399 return eligible;
4400}
4401
4402boolean_t
4403memorystatus_is_foreground_locked(proc_t p) {
4404 return ((p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
4405 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT));
4406}
fe8ab488 4407#endif /* CONFIG_MEMORYSTATUS */
39236c6e
A
4408
4409/*
4410 * Trigger levels to test the mechanism.
4411 * Can be used via a sysctl.
4412 */
4413#define TEST_LOW_MEMORY_TRIGGER_ONE 1
4414#define TEST_LOW_MEMORY_TRIGGER_ALL 2
4415#define TEST_PURGEABLE_TRIGGER_ONE 3
4416#define TEST_PURGEABLE_TRIGGER_ALL 4
4417#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE 5
4418#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL 6
4419
4420boolean_t memorystatus_manual_testing_on = FALSE;
4421vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;
4422
4423extern struct knote *
fe8ab488 4424vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t);
39236c6e
A
4425
4426extern
fe8ab488 4427kern_return_t vm_pressure_notification_without_levels(boolean_t);
39236c6e
A
4428
4429extern void vm_pressure_klist_lock(void);
4430extern void vm_pressure_klist_unlock(void);
4431
4432extern void vm_reset_active_list(void);
4433
4434extern void delay(int);
4435
4436#define INTER_NOTIFICATION_DELAY (250000) /* .25 second */
4437
4438void memorystatus_on_pageout_scan_end(void) {
4439 /* No-op */
4440}
4441
4442/*
4443 * kn_max - knote
4444 *
4445 * knote_pressure_level - to check if the knote is registered for this notification level.
4446 *
4447 * task - task whose bits we'll be modifying
4448 *
4449 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
4450 *
4451 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
4452 *
4453 */
39236c6e
A
4454
4455boolean_t
4456is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
4457{
4458 if (kn_max->kn_sfflags & knote_pressure_level) {
4459
4460 if (task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
4461
4462 task_clear_has_been_notified(task, pressure_level_to_clear);
4463 }
4464
4465 task_mark_has_been_notified(task, pressure_level_to_set);
4466 return TRUE;
4467 }
4468
4469 return FALSE;
4470}
4471
fe8ab488
A
4472extern kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process);
4473
4474#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */
39236c6e
A
4475
4476kern_return_t
fe8ab488 4477memorystatus_update_vm_pressure(boolean_t target_foreground_process)
39236c6e
A
4478{
4479 struct knote *kn_max = NULL;
3e170ce0 4480 struct knote *kn_cur = NULL, *kn_temp = NULL; /* for safe list traversal */
39236c6e
A
4481 pid_t target_pid = -1;
4482 struct klist dispatch_klist = { NULL };
4483 proc_t target_proc = PROC_NULL;
39236c6e
A
4484 struct task *task = NULL;
4485 boolean_t found_candidate = FALSE;
4486
fe8ab488
A
4487 static vm_pressure_level_t level_snapshot = kVMPressureNormal;
4488 static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
4489 boolean_t smoothing_window_started = FALSE;
4490 struct timeval smoothing_window_start_tstamp = {0, 0};
4491 struct timeval curr_tstamp = {0, 0};
4492 int elapsed_msecs = 0;
4493
4494#if !CONFIG_JETSAM
4495#define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */
4496
4497 int idle_kill_counter = 0;
4498
4499 /*
4500 * On desktop we take this opportunity to free up memory pressure
4501 * by immediately killing idle exitable processes. We use a delay
4502 * to avoid overkill. And we impose a max counter as a fail safe
4503 * in case daemons re-launch too fast.
4504 */
4505 while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
4506 if (memorystatus_idle_exit_from_VM() == FALSE) {
4507 /* No idle exitable processes left to kill */
4508 break;
4509 }
4510 idle_kill_counter++;
3e170ce0
A
4511
4512 if (memorystatus_manual_testing_on == TRUE) {
4513 /*
4514 * Skip the delay when testing
4515 * the pressure notification scheme.
4516 */
4517 } else {
4518 delay(1000000); /* 1 second */
4519 }
fe8ab488
A
4520 }
4521#endif /* !CONFIG_JETSAM */
4522
39236c6e
A
4523 while (1) {
4524
4525 /*
4526 * There is a race window here. But it's not clear
4527 * how much we benefit from having extra synchronization.
4528 */
4529 level_snapshot = memorystatus_vm_pressure_level;
4530
fe8ab488
A
4531 if (prev_level_snapshot > level_snapshot) {
4532 /*
4533 * Pressure decreased? Let's take a little breather
4534 * and see if this condition stays.
4535 */
4536 if (smoothing_window_started == FALSE) {
4537
4538 smoothing_window_started = TRUE;
4539 microuptime(&smoothing_window_start_tstamp);
4540 }
4541
4542 microuptime(&curr_tstamp);
4543 timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
4544 elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
4545
4546 if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
4547
4548 delay(INTER_NOTIFICATION_DELAY);
4549 continue;
4550 }
4551 }
4552
4553 prev_level_snapshot = level_snapshot;
4554 smoothing_window_started = FALSE;
4555
39236c6e 4556 memorystatus_klist_lock();
fe8ab488 4557 kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);
39236c6e
A
4558
4559 if (kn_max == NULL) {
4560 memorystatus_klist_unlock();
4561
4562 /*
4563 * No more level-based clients to notify.
4564 * Try the non-level based notification clients.
4565 *
4566 * However, these non-level clients don't understand
4567 * the "return-to-normal" notification.
4568 *
4569 * So don't consider them for those notifications. Just
4570 * return instead.
4571 *
4572 */
4573
4574 if (level_snapshot != kVMPressureNormal) {
4575 goto try_dispatch_vm_clients;
4576 } else {
4577 return KERN_FAILURE;
4578 }
4579 }
4580
4581 target_proc = kn_max->kn_kq->kq_p;
4582
4583 proc_list_lock();
4584 if (target_proc != proc_ref_locked(target_proc)) {
4585 target_proc = PROC_NULL;
4586 proc_list_unlock();
4587 memorystatus_klist_unlock();
4588 continue;
4589 }
4590 proc_list_unlock();
39236c6e
A
4591
4592 target_pid = target_proc->p_pid;
4593
4594 task = (struct task *)(target_proc->task);
4595
4596 if (level_snapshot != kVMPressureNormal) {
4597
4598 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
4599
4600 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, kVMPressureCritical, kVMPressureWarning) == TRUE) {
4601 found_candidate = TRUE;
4602 }
4603 } else {
4604 if (level_snapshot == kVMPressureCritical) {
4605
4606 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, kVMPressureWarning, kVMPressureCritical) == TRUE) {
4607 found_candidate = TRUE;
4608 }
4609 }
4610 }
4611 } else {
4612 if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
4613
4614 task_clear_has_been_notified(task, kVMPressureWarning);
4615 task_clear_has_been_notified(task, kVMPressureCritical);
4616
4617 found_candidate = TRUE;
6d2010ae
A
4618 }
4619 }
39236c6e
A
4620
4621 if (found_candidate == FALSE) {
3e170ce0
A
4622 proc_rele(target_proc);
4623 memorystatus_klist_unlock();
39236c6e
A
4624 continue;
4625 }
4626
3e170ce0
A
4627 SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
4628 proc_t knote_proc = kn_cur->kn_kq->kq_p;
4629 pid_t knote_pid = knote_proc->p_pid;
4630 if (knote_pid == target_pid) {
4631 KNOTE_DETACH(&memorystatus_klist, kn_cur);
4632 KNOTE_ATTACH(&dispatch_klist, kn_cur);
4633 }
4634 }
39236c6e
A
4635
4636 KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
4637
3e170ce0
A
4638 SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
4639 KNOTE_DETACH(&dispatch_klist, kn_cur);
4640 KNOTE_ATTACH(&memorystatus_klist, kn_cur);
4641 }
4642
39236c6e
A
4643 memorystatus_klist_unlock();
4644
4645 microuptime(&target_proc->vm_pressure_last_notify_tstamp);
4646 proc_rele(target_proc);
4647
fe8ab488 4648 if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
39236c6e
A
4649 break;
4650 }
4651
4652try_dispatch_vm_clients:
fe8ab488
A
4653 if (kn_max == NULL && level_snapshot != kVMPressureNormal) {
4654 /*
4655 * We will exit this loop when we are done with
4656 * notification clients (level and non-level based).
39236c6e 4657 */
fe8ab488 4658 if ((vm_pressure_notify_dispatch_vm_clients(target_foreground_process) == KERN_FAILURE) && (kn_max == NULL)) {
39236c6e
A
4659 /*
4660 * kn_max == NULL i.e. we didn't find any eligible clients for the level-based notifications
4661 * AND
4662 * we have failed to find any eligible clients for the non-level based notifications too.
4663 * So, we are done.
4664 */
4665
4666 return KERN_FAILURE;
4667 }
4668 }
4669
fe8ab488
A
4670 /*
 4671 * LD: This block of code below used to be invoked in the older memory notification scheme on embedded every time
4672 * a process was sent a memory pressure notification. The "memorystatus_klist" list was used to hold these
4673 * privileged listeners. But now we have moved to the newer scheme and are trying to move away from the extra
4674 * notifications. So the code is here in case we break compat. and need to send out notifications to the privileged
4675 * apps.
4676 */
4677#if 0
4678#endif /* 0 */
4679
4680 if (memorystatus_manual_testing_on == TRUE) {
4681 /*
4682 * Testing out the pressure notification scheme.
4683 * No need for delays etc.
4684 */
4685 } else {
4686
4687 uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
4688#if CONFIG_JETSAM
4689 unsigned int page_delta = 0;
4690 unsigned int skip_delay_page_threshold = 0;
4691
4692 assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
4693
4694 page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
4695 skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
4696
4697 if (memorystatus_available_pages <= skip_delay_page_threshold) {
4698 /*
 4699 * We are nearing the critical mark fast and can't afford to wait between
4700 * notifications.
4701 */
4702 sleep_interval = 0;
4703 }
4704#endif /* CONFIG_JETSAM */
4705
4706 if (sleep_interval) {
4707 delay(sleep_interval);
4708 }
39236c6e 4709 }
6d2010ae 4710 }
39236c6e
A
4711
4712 return KERN_SUCCESS;
6d2010ae
A
4713}
4714
39236c6e
A
4715vm_pressure_level_t
4716convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
4717
4718vm_pressure_level_t
4719convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
4720{
4721 vm_pressure_level_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
4722
4723 switch (internal_pressure_level) {
4724
4725 case kVMPressureNormal:
4726 {
4727 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
4728 break;
4729 }
4730
4731 case kVMPressureWarning:
4732 case kVMPressureUrgent:
4733 {
4734 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
4735 break;
4736 }
4737
4738 case kVMPressureCritical:
4739 {
4740 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
4741 break;
4742 }
4743
4744 default:
4745 break;
4746 }
316670eb 4747
39236c6e
A
4748 return dispatch_level;
4749}
6d2010ae 4750
b0d623f7 4751static int
39236c6e 4752sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
b0d623f7 4753{
39236c6e 4754#pragma unused(arg1, arg2, oidp)
39236c6e
A
4755 vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
4756
4757 return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
4758}
4759
fe8ab488
A
4760#if DEBUG || DEVELOPMENT
4761
39236c6e
A
4762SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED,
4763 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
4764
fe8ab488
A
4765#else /* DEBUG || DEVELOPMENT */
4766
4767SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED,
4768 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
4769
4770#endif /* DEBUG || DEVELOPMENT */
b0d623f7 4771
39236c6e
A
4772extern int memorystatus_purge_on_warning;
4773extern int memorystatus_purge_on_critical;
4774
4775static int
4776sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
4777{
4778#pragma unused(arg1, arg2)
b0d623f7 4779
39236c6e
A
4780 int level = 0;
4781 int error = 0;
4782 int pressure_level = 0;
4783 int trigger_request = 0;
4784 int force_purge;
4785
4786 error = sysctl_handle_int(oidp, &level, 0, req);
4787 if (error || !req->newptr) {
4788 return (error);
4789 }
4790
4791 memorystatus_manual_testing_on = TRUE;
4792
4793 trigger_request = (level >> 16) & 0xFFFF;
4794 pressure_level = (level & 0xFFFF);
4795
4796 if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
4797 trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
4798 return EINVAL;
4799 }
4800 switch (pressure_level) {
4801 case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
4802 case NOTE_MEMORYSTATUS_PRESSURE_WARN:
4803 case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
4804 break;
4805 default:
b0d623f7
A
4806 return EINVAL;
4807 }
b0d623f7 4808
39236c6e
A
4809 /*
4810 * The pressure level is being set from user-space.
4811 * And user-space uses the constants in sys/event.h
4812 * So we translate those events to our internal levels here.
4813 */
4814 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
4815
4816 memorystatus_manual_testing_level = kVMPressureNormal;
4817 force_purge = 0;
4818
4819 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
4820
4821 memorystatus_manual_testing_level = kVMPressureWarning;
4822 force_purge = memorystatus_purge_on_warning;
4823
4824 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
4825
4826 memorystatus_manual_testing_level = kVMPressureCritical;
4827 force_purge = memorystatus_purge_on_critical;
b0d623f7
A
4828 }
4829
39236c6e 4830 memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
316670eb 4831
39236c6e
A
4832 /* purge according to the new pressure level */
4833 switch (trigger_request) {
4834 case TEST_PURGEABLE_TRIGGER_ONE:
4835 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
4836 if (force_purge == 0) {
4837 /* no purging requested */
4838 break;
4839 }
4840 vm_purgeable_object_purge_one_unlocked(force_purge);
4841 break;
4842 case TEST_PURGEABLE_TRIGGER_ALL:
4843 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
4844 if (force_purge == 0) {
4845 /* no purging requested */
4846 break;
4847 }
4848 while (vm_purgeable_object_purge_one_unlocked(force_purge));
4849 break;
4850 }
4851
4852 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
4853 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
4854
4855 memorystatus_update_vm_pressure(TRUE);
4856 }
4857
4858 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
4859 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
4860
4861 while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
4862 continue;
4863 }
4864 }
4865
4866 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
4867 memorystatus_manual_testing_on = FALSE;
4868
4869 vm_pressure_klist_lock();
4870 vm_reset_active_list();
4871 vm_pressure_klist_unlock();
4872 } else {
4873
4874 vm_pressure_klist_lock();
fe8ab488 4875 vm_pressure_notification_without_levels(FALSE);
39236c6e
A
4876 vm_pressure_klist_unlock();
4877 }
4878
4879 return 0;
b0d623f7
A
4880}

SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");

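/*
 * Illustrative sketch (not part of the kernel build): how a user-space test
 * tool might fire the manual trigger above via the standard sysctlbyname(3)
 * interface. The trigger request goes in the high 16 bits and the
 * sys/event.h pressure level in the low 16 bits, matching the decoding in
 * sysctl_memorypressure_manual_trigger(). The helper name and its parameters
 * are hypothetical.
 */
#if 0
#include <sys/sysctl.h>

static int
trigger_manual_pressure(int trigger_request, int pressure_level)
{
    /* Pack: trigger request in bits 31..16, pressure level in bits 15..0 */
    int level = ((trigger_request & 0xFFFF) << 16) | (pressure_level & 0xFFFF);

    return sysctlbyname("kern.memorypressure_manual_trigger", NULL, NULL,
        &level, sizeof(level));
}
#endif
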

extern int memorystatus_purge_on_warning;
extern int memorystatus_purge_on_urgent;
extern int memorystatus_purge_on_critical;

SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_warning, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_urgent, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_critical, 0, "");

#endif /* VM_PRESSURE_EVENTS */

/* Return both allocated and actual size, since there's a race between allocation and list compilation */
static int
memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
{
    uint32_t list_count, i = 0;
    memorystatus_priority_entry_t *list_entry;
    proc_t p;

    list_count = memorystatus_list_count;
    *list_size = sizeof(memorystatus_priority_entry_t) * list_count;

    /* Just a size check? */
    if (size_only) {
        return 0;
    }

    /* Otherwise, validate the size of the buffer */
    if (*buffer_size < *list_size) {
        return EINVAL;
    }

    *list_ptr = (memorystatus_priority_entry_t*)kalloc(*list_size);
    if (!*list_ptr) {
        return ENOMEM;
    }

    memset(*list_ptr, 0, *list_size);

    *buffer_size = *list_size;
    *list_size = 0;

    list_entry = *list_ptr;

    proc_list_lock();

    p = memorystatus_get_first_proc_locked(&i, TRUE);
    while (p && (*list_size < *buffer_size)) {
        list_entry->pid = p->p_pid;
        list_entry->priority = p->p_memstat_effectivepriority;
        list_entry->user_data = p->p_memstat_userdata;
#if LEGACY_HIWATER
        /*
         * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore.
         * Background limits are described via the inactive limit slots.
         * So, here, the cached limit should always be valid.
         */
        if (p->p_memstat_memlimit <= 0) {
            task_get_phys_footprint_limit(p->task, &list_entry->limit);
        } else {
            list_entry->limit = p->p_memstat_memlimit;
        }
#else
        task_get_phys_footprint_limit(p->task, &list_entry->limit);
#endif
        list_entry->state = memorystatus_build_state(p);
        list_entry++;

        *list_size += sizeof(memorystatus_priority_entry_t);

        p = memorystatus_get_next_proc_locked(&i, p, TRUE);
    }

    proc_list_unlock();

    MEMORYSTATUS_DEBUG(1, "memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);

    return 0;
}

static int
memorystatus_cmd_get_priority_list(user_addr_t buffer, size_t buffer_size, int32_t *retval) {
    int error = EINVAL;
    boolean_t size_only;
    memorystatus_priority_entry_t *list = NULL;
    size_t list_size;

    size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

    error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);
    if (error) {
        goto out;
    }

    if (!size_only) {
        error = copyout(list, buffer, list_size);
    }

    if (error == 0) {
        *retval = list_size;
    }
out:
    if (list) {
        kfree(list, buffer_size);
    }

    return error;
}
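
/*
 * Illustrative sketch (not part of the kernel build): the two-phase
 * user-space calling convention for MEMORYSTATUS_CMD_GET_PRIORITY_LIST,
 * assuming the user-space memorystatus_control() syscall wrapper. Passing
 * a NULL buffer performs the size-only probe; because the process list can
 * grow between the two calls, the kernel reports the byte count actually
 * copied out via the call's return value.
 */
#if 0
#include <stdlib.h>
#include <sys/kern_memorystatus.h>

static memorystatus_priority_entry_t *
fetch_priority_list(int *count_out)
{
    /* Phase 1: size-only probe (NULL buffer) */
    int size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, 0, 0, NULL, 0);
    if (size <= 0) {
        return NULL;
    }

    /* Phase 2: fetch into a buffer of the probed size */
    memorystatus_priority_entry_t *list = malloc(size);
    size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, 0, 0, list, size);
    if (size <= 0) {
        free(list);
        return NULL;
    }

    *count_out = size / sizeof(memorystatus_priority_entry_t);
    return list;    /* caller frees */
}
#endif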

#if CONFIG_JETSAM

static void
memorystatus_clear_errors(void)
{
    proc_t p;
    unsigned int i = 0;

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START, 0, 0, 0, 0, 0);

    proc_list_lock();

    p = memorystatus_get_first_proc_locked(&i, TRUE);
    while (p) {
        if (p->p_memstat_state & P_MEMSTAT_ERROR) {
            p->p_memstat_state &= ~P_MEMSTAT_ERROR;
        }
        p = memorystatus_get_next_proc_locked(&i, p, TRUE);
    }

    proc_list_unlock();

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}

static void
memorystatus_update_levels_locked(boolean_t critical_only) {

    memorystatus_available_pages_critical = memorystatus_available_pages_critical_base;

    /*
     * If there's an entry in the first bucket, we have idle processes.
     */
    memstat_bucket_t *first_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
    if (first_bucket->count) {
        memorystatus_available_pages_critical += memorystatus_available_pages_critical_idle_offset;

        if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
            /*
             * The critical threshold must never exceed the pressure threshold.
             */
            memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
        }
    }

#if DEBUG || DEVELOPMENT
    if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
        memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_diagnostic;

        if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
            /*
             * The critical threshold must never exceed the pressure threshold.
             */
            memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
        }
    }
#endif

    if (critical_only) {
        return;
    }

#if VM_PRESSURE_EVENTS
    memorystatus_available_pages_pressure = (pressure_threshold_percentage / delta_percentage) * memorystatus_delta;
#if DEBUG || DEVELOPMENT
    if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
        memorystatus_available_pages_pressure += memorystatus_jetsam_policy_offset_pages_diagnostic;
    }
#endif
#endif
}
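
/*
 * Worked example for the pressure computation above (the percentages shown
 * are illustrative assumptions, not necessarily the shipped defaults): if
 * pressure_threshold_percentage is 15 and delta_percentage is 5, the
 * pressure threshold becomes (15 / 5) * memorystatus_delta, i.e. three
 * "delta" units of available pages. Note this is integer division, so the
 * two percentages are expected to divide evenly.
 */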

/*
 * Get the at_boot snapshot
 */
static int
memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
    size_t input_size = *snapshot_size;

    /*
     * The at_boot snapshot has no entry list.
     */
    *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t);

    if (size_only) {
        return 0;
    }

    /*
     * Validate the size of the snapshot buffer
     */
    if (input_size < *snapshot_size) {
        return EINVAL;
    }

    /*
     * Update the notification_time only
     */
    memorystatus_at_boot_snapshot.notification_time = mach_absolute_time();
    *snapshot = &memorystatus_at_boot_snapshot;

    MEMORYSTATUS_DEBUG(7, "memorystatus_get_at_boot_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%d)\n",
        (long)input_size, (long)*snapshot_size, 0);
    return 0;
}

static int
memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
    size_t input_size = *snapshot_size;
    uint32_t ods_list_count = memorystatus_list_count;
    memorystatus_jetsam_snapshot_t *ods = NULL;    /* The on_demand snapshot buffer */

    *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (ods_list_count));

    if (size_only) {
        return 0;
    }

    /*
     * Validate the size of the snapshot buffer.
     * This is inherently racy. May want to revisit
     * this error condition and trim the output when
     * it doesn't fit.
     */
    if (input_size < *snapshot_size) {
        return EINVAL;
    }

    /*
     * Allocate and initialize a snapshot buffer.
     */
    ods = (memorystatus_jetsam_snapshot_t *)kalloc(*snapshot_size);
    if (!ods) {
        return (ENOMEM);
    }

    memset(ods, 0, *snapshot_size);

    proc_list_lock();
    memorystatus_init_jetsam_snapshot_locked(ods, ods_list_count);
    proc_list_unlock();

    /*
     * Return the kernel allocated, on_demand buffer.
     * The caller of this routine will copy the data out
     * to user space and then free the kernel allocated
     * buffer.
     */
    *snapshot = ods;

    MEMORYSTATUS_DEBUG(7, "memorystatus_get_on_demand_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
        (long)input_size, (long)*snapshot_size, (long)ods_list_count);

    return 0;
}

static int
memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
    size_t input_size = *snapshot_size;

    if (memorystatus_jetsam_snapshot_count > 0) {
        *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
    } else {
        *snapshot_size = 0;
    }

    if (size_only) {
        return 0;
    }

    if (input_size < *snapshot_size) {
        return EINVAL;
    }

    *snapshot = memorystatus_jetsam_snapshot;

    MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
        (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_count);

    return 0;
}

static int
memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval) {
    int error = EINVAL;
    boolean_t size_only;
    boolean_t is_default_snapshot = FALSE;
    boolean_t is_on_demand_snapshot = FALSE;
    boolean_t is_at_boot_snapshot = FALSE;
    memorystatus_jetsam_snapshot_t *snapshot = NULL;

    size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

    if (flags == 0) {
        /* Default */
        is_default_snapshot = TRUE;
        error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
    } else {
        if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) {
            /*
             * Unsupported bit set in flags.
             */
            return EINVAL;
        }

        if ((flags & (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) ==
            (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) {
            /*
             * Can't have both set at the same time.
             */
            return EINVAL;
        }

        if (flags & MEMORYSTATUS_SNAPSHOT_ON_DEMAND) {
            is_on_demand_snapshot = TRUE;
            /*
             * When not requesting the size only, the following call will allocate
             * an on_demand snapshot buffer, which is freed below.
             */
            error = memorystatus_get_on_demand_snapshot(&snapshot, &buffer_size, size_only);
        } else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
            is_at_boot_snapshot = TRUE;
            error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
        } else {
            /*
             * Invalid flag setting.
             */
            return EINVAL;
        }
    }

    if (error) {
        goto out;
    }

    /*
     * Copy the data out to user space and clear the snapshot buffer.
     * If working with the jetsam snapshot, clearing the buffer means resetting its count.
     * If working with an on_demand snapshot, clearing the buffer means freeing it.
     * If working with the at_boot snapshot, there is nothing to clear or update.
     */
    if (!size_only) {
        if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
            if (is_default_snapshot) {
                /*
                 * The jetsam snapshot is never freed; its count is simply reset.
                 */
                snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;

                proc_list_lock();
                memorystatus_jetsam_snapshot_last_timestamp = 0;
                proc_list_unlock();
            }
        }

        if (is_on_demand_snapshot) {
            /*
             * The on_demand snapshot is always freed,
             * even if the copyout failed.
             */
            if (snapshot) {
                kfree(snapshot, buffer_size);
            }
        }
    }

    if (error == 0) {
        *retval = buffer_size;
    }
out:
    return error;
}
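
/*
 * Illustrative sketch (not part of the kernel build): fetching the default
 * jetsam snapshot from user space via the memorystatus_control() wrapper.
 * flags == 0 selects the default snapshot; MEMORYSTATUS_SNAPSHOT_ON_DEMAND
 * or MEMORYSTATUS_SNAPSHOT_AT_BOOT (but not both) select the other variants.
 * A successful full fetch of the default snapshot also resets its entry
 * count, so an immediate second probe may report a size of 0.
 */
#if 0
#include <stdlib.h>
#include <sys/kern_memorystatus.h>

static memorystatus_jetsam_snapshot_t *
fetch_default_snapshot(void)
{
    /* Size-only probe with a NULL buffer */
    int size = memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0, 0, NULL, 0);
    if (size <= 0) {
        return NULL;    /* no snapshot available */
    }

    memorystatus_jetsam_snapshot_t *snap = malloc(size);
    if (memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0, 0, snap, size) <= 0) {
        free(snap);
        return NULL;
    }
    return snap;    /* caller frees */
}
#endif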

/*
 * Routine:	memorystatus_cmd_grp_set_properties
 * Purpose:	Update properties for a group of processes.
 *
 * Supported Properties:
 * [priority]
 *	Move each process out of its effective priority
 *	band and into a new priority band.
 *	Maintains relative order from lowest to highest priority.
 *	Within a single band, maintains relative order from head to tail.
 *
 *	eg: before	[effectivepriority | pid]
 *			[18 | p101 ]
 *			[17 | p55, p67, p19 ]
 *			[12 | p103 p10 ]
 *			[ 7 | p25 ]
 *			[ 0 | p71, p82, ]
 *
 *	after	[ new band | pid]
 *		[ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
 *
 * Returns:	0 on success, else non-zero.
 *
 * Caveat:	We know there is a race window regarding recycled pids.
 *		A process could be killed before the kernel can act on it here.
 *		If a pid cannot be found in any of the jetsam priority bands,
 *		then we simply ignore it. No harm.
 *		But if the pid has been recycled then it could be an issue.
 *		In that scenario, we might move an unsuspecting process to the new
 *		priority band. It's not clear how the kernel can safeguard
 *		against this, but it would be an extremely rare case anyway.
 *		The caller of this API might avoid such race conditions by
 *		ensuring that the processes passed in the pid list are suspended.
 */


/* This internal structure can expand when we add support for more properties */
typedef struct memorystatus_internal_properties
{
    proc_t proc;
    int32_t priority;    /* see memorystatus_priority_entry_t : priority */
} memorystatus_internal_properties_t;

static int
memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {

#pragma unused(flags)

    /*
     * We currently only handle setting priority, per process.
     */

    int error = 0;
    memorystatus_priority_entry_t *entries = NULL;
    uint32_t entry_count = 0;

    /* This will be the ordered proc list */
    memorystatus_internal_properties_t *table = NULL;
    size_t table_size = 0;
    uint32_t table_count = 0;

    uint32_t i = 0;
    uint32_t bucket_index = 0;
    boolean_t head_insert;
    int32_t new_priority;

    proc_t p;

    /* Verify inputs */
    if ((buffer == USER_ADDR_NULL) || (buffer_size == 0) || ((buffer_size % sizeof(memorystatus_priority_entry_t)) != 0)) {
        error = EINVAL;
        goto out;
    }

    entry_count = (buffer_size / sizeof(memorystatus_priority_entry_t));
    if ((entries = (memorystatus_priority_entry_t *)kalloc(buffer_size)) == NULL) {
        error = ENOMEM;
        goto out;
    }

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, entry_count, 0, 0, 0, 0);

    if ((error = copyin(buffer, entries, buffer_size)) != 0) {
        goto out;
    }

    /* Verify sanity of input priorities */
    for (i = 0; i < entry_count; i++) {
        if (entries[i].priority == -1) {
            /* Use as shorthand for default priority */
            entries[i].priority = JETSAM_PRIORITY_DEFAULT;
        } else if (entries[i].priority == JETSAM_PRIORITY_IDLE_DEFERRED) {
            /* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use;
             * if requested, adjust to JETSAM_PRIORITY_IDLE. */
            entries[i].priority = JETSAM_PRIORITY_IDLE;
        } else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
            /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
             * queue */
            /* Deal with this later */
        } else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
            /* Sanity check */
            error = EINVAL;
            goto out;
        }
    }

    table_size = sizeof(memorystatus_internal_properties_t) * entry_count;
    if ((table = (memorystatus_internal_properties_t *)kalloc(table_size)) == NULL) {
        error = ENOMEM;
        goto out;
    }
    memset(table, 0, table_size);

    /*
     * For each jetsam bucket entry, spin through the input property list.
     * When a matching pid is found, populate an adjacent table with the
     * appropriate proc pointer and new property values.
     * This traversal automatically preserves order from lowest
     * to highest priority.
     */

    bucket_index = 0;

    proc_list_lock();

    /* Create the ordered table */
    p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
    while (p && (table_count < entry_count)) {
        for (i = 0; i < entry_count; i++) {
            if (p->p_pid == entries[i].pid) {
                /* Build the table data */
                table[table_count].proc = p;
                table[table_count].priority = entries[i].priority;
                table_count++;
                break;
            }
        }
        p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
    }

    /* We now have an ordered list of procs ready to move */
    for (i = 0; i < table_count; i++) {
        p = table[i].proc;
        assert(p != NULL);

        /* Allow head inserts -- but relative order is then arbitrary */
        if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
            new_priority = JETSAM_PRIORITY_IDLE;
            head_insert = true;
        } else {
            new_priority = table[i].priority;
            head_insert = false;
        }

        /* Not allowed */
        if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
            continue;
        }

        /*
         * Take appropriate steps if moving proc out of the
         * JETSAM_PRIORITY_IDLE_DEFERRED band.
         */
        if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
            memorystatus_invalidate_idle_demotion_locked(p, TRUE);
        }

        memorystatus_update_priority_locked(p, new_priority, head_insert);
    }

    proc_list_unlock();

    /*
     * if (table_count != entry_count)
     * then some pids were not found in a jetsam band.
     * harmless but interesting...
     */
    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, entry_count, table_count, 0, 0, 0);

out:
    if (entries)
        kfree(entries, buffer_size);
    if (table)
        kfree(table, table_size);

    return (error);
}
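
/*
 * Illustrative sketch (not part of the kernel build): moving a group of
 * processes into one band with MEMORYSTATUS_CMD_GRP_SET_PROPERTIES from
 * user space. The buffer is an array of memorystatus_priority_entry_t;
 * only the pid and priority fields matter for this command. Relative
 * jetsam order among the pids is preserved regardless of the array order,
 * because the kernel re-sorts the request against the existing bands.
 * The helper name is hypothetical.
 */
#if 0
#include <string.h>
#include <sys/kern_memorystatus.h>

static int
demote_group_to_idle(const pid_t *pids, int count)
{
    memorystatus_priority_entry_t entries[count];

    memset(entries, 0, sizeof(entries));
    for (int i = 0; i < count; i++) {
        entries[i].pid = pids[i];
        entries[i].priority = JETSAM_PRIORITY_IDLE;
    }

    return memorystatus_control(MEMORYSTATUS_CMD_GRP_SET_PROPERTIES, 0, 0,
        entries, sizeof(entries));
}
#endif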

/*
 * This routine is used to update a process's jetsam priority position and stored user_data.
 * It is not used for the setting of memory limits, which is why the last seven args to the
 * memorystatus_update() call are 0 or FALSE.
 */

static int
memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
    int error = 0;
    memorystatus_priority_properties_t mpp_entry;

    /* Validate inputs */
    if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_properties_t))) {
        return EINVAL;
    }

    error = copyin(buffer, &mpp_entry, buffer_size);

    if (error == 0) {
        proc_t p;

        p = proc_find(pid);
        if (!p) {
            return ESRCH;
        }

        if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
            proc_rele(p);
            return EPERM;
        }

        error = memorystatus_update(p, mpp_entry.priority, mpp_entry.user_data, FALSE, FALSE, 0, 0, FALSE, FALSE, FALSE);
        proc_rele(p);
    }

    return (error);
}

static int
memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
    int error = 0;
    memorystatus_memlimit_properties_t mmp_entry;

    /* Validate inputs */
    if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
        return EINVAL;
    }

    error = copyin(buffer, &mmp_entry, buffer_size);

    if (error == 0) {
        error = memorystatus_set_memlimit_properties(pid, &mmp_entry);
    }

    return (error);
}

/*
 * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
 * That gets the proc's cached memlimit and there is no guarantee that the active/inactive
 * limits will be the same in the no-limit case. Instead we convert limits <= 0 using
 * task_convert_phys_footprint_limit(). It computes the same limit value that would be written
 * to the task's ledgers via task_set_phys_footprint_limit().
 */
static int
memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
    int error = 0;
    memorystatus_memlimit_properties_t mmp_entry;

    /* Validate inputs */
    if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
        return EINVAL;
    }

    memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties_t));

    proc_t p = proc_find(pid);
    if (!p) {
        return ESRCH;
    }

    /*
     * Get the active limit and attributes.
     * No locks taken since we hold a reference to the proc.
     */
    if (p->p_memstat_memlimit_active > 0) {
        mmp_entry.memlimit_active = p->p_memstat_memlimit_active;
    } else {
        task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_active);
    }

    if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {
        mmp_entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
    }

    /*
     * Get the inactive limit and attributes
     */
    if (p->p_memstat_memlimit_inactive <= 0) {
        task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_inactive);
    } else {
        mmp_entry.memlimit_inactive = p->p_memstat_memlimit_inactive;
    }
    if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {
        mmp_entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
    }
    proc_rele(p);

    error = copyout(&mmp_entry, buffer, buffer_size);

    return (error);
}
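
/*
 * Illustrative sketch (not part of the kernel build): setting a 2-level
 * memory limit from user space. Here the active (e.g. foreground/dirty)
 * limit is a hypothetical 100 MB soft limit and the inactive limit a
 * hypothetical 50 MB hard (fatal) limit; both values and the unit of MB
 * are assumptions for illustration.
 */
#if 0
#include <sys/kern_memorystatus.h>

static int
set_two_level_limit(pid_t pid)
{
    memorystatus_memlimit_properties_t mmp = {0};

    mmp.memlimit_active = 100;        /* assumed MB, non-fatal while active */
    mmp.memlimit_active_attr = 0;
    mmp.memlimit_inactive = 50;       /* assumed MB, fatal while inactive */
    mmp.memlimit_inactive_attr = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;

    return memorystatus_control(MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES,
        pid, 0, &mmp, sizeof(mmp));
}
#endif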

static int
memorystatus_cmd_get_pressure_status(int32_t *retval) {
    int error;

    /* Need privilege for check */
    error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
    if (error) {
        return (error);
    }

    /* Inherently racy, so it's not worth taking a lock here */
    *retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;

    return error;
}

int
memorystatus_get_pressure_status_kdp(void) {
    return (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
}
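
/*
 * Illustrative sketch (not part of the kernel build): polling the pressure
 * status from user space. The caller must be root and hold PRIV_VM_PRESSURE,
 * per the checks above. The wrapper's return value carries retval: 1 when
 * the system is above the normal pressure level, 0 when normal, -1 on error.
 */
#if 0
#include <sys/kern_memorystatus.h>

static int
system_under_pressure(void)
{
    return memorystatus_control(MEMORYSTATUS_CMD_GET_PRESSURE_STATUS, 0, 0, NULL, 0);
}
#endif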

/*
 * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
 *
 * This call is inflexible -- it does not distinguish between active/inactive, fatal/non-fatal.
 * So, with the 2-level HWM scheme, preserving previous behavior maps as follows:
 * - treat the limit passed in as both an active and inactive limit.
 * - treat the is_fatal_limit flag as though it applies to both active and inactive limits.
 *
 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK
 * - is_fatal_limit is FALSE, meaning the active and inactive limits are non-fatal/soft
 * - so the mapping is (active/non-fatal, inactive/non-fatal)
 *
 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
 * - is_fatal_limit is TRUE, meaning the process's active and inactive limits are fatal/hard
 * - so the mapping is (active/fatal, inactive/fatal)
 */

static int
memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit) {
    int error = 0;
    memorystatus_memlimit_properties_t entry;

    entry.memlimit_active = high_water_mark;
    entry.memlimit_active_attr = 0;
    entry.memlimit_inactive = high_water_mark;
    entry.memlimit_inactive_attr = 0;

    if (is_fatal_limit == TRUE) {
        entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
        entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
    }

    error = memorystatus_set_memlimit_properties(pid, &entry);
    return (error);
}

static int
memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry) {

    int32_t memlimit_active;
    boolean_t memlimit_active_is_fatal;
    int32_t memlimit_inactive;
    boolean_t memlimit_inactive_is_fatal;
    uint32_t valid_attrs = 0;
    int error = 0;

    proc_t p = proc_find(pid);
    if (!p) {
        return ESRCH;
    }

    /*
     * Check for valid attribute flags.
     */
    valid_attrs |= (MEMORYSTATUS_MEMLIMIT_ATTR_FATAL);
    if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
        proc_rele(p);
        return EINVAL;
    }
    if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
        proc_rele(p);
        return EINVAL;
    }

    /*
     * Setup the active memlimit properties
     */
    memlimit_active = entry->memlimit_active;
    if (entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
        memlimit_active_is_fatal = TRUE;
    } else {
        memlimit_active_is_fatal = FALSE;
    }

    /*
     * Setup the inactive memlimit properties
     */
    memlimit_inactive = entry->memlimit_inactive;
    if (entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
        memlimit_inactive_is_fatal = TRUE;
    } else {
        memlimit_inactive_is_fatal = FALSE;
    }

    /*
     * Setting a limit of <= 0 implies that the process has no
     * high-water-mark and has no per-task-limit. That means
     * the system_wide task limit is in place, which by the way,
     * is always fatal.
     */

    if (memlimit_active <= 0) {
        /*
         * Enforce the fatal system_wide task limit while process is active.
         */
        memlimit_active = -1;
        memlimit_active_is_fatal = TRUE;
    }

    if (memlimit_inactive <= 0) {
        /*
         * Enforce the fatal system_wide task limit while process is inactive.
         */
        memlimit_inactive = -1;
        memlimit_inactive_is_fatal = TRUE;
    }

    proc_list_lock();

    /*
     * Store the active limit variants in the proc.
     */
    SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);

    /*
     * Store the inactive limit variants in the proc.
     */
    SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);

    /*
     * Enforce appropriate limit variant by updating the cached values
     * and writing the ledger.
     * Limit choice is based on process active/inactive state.
     */

    if (memorystatus_highwater_enabled) {
        boolean_t trigger_exception;
        /*
         * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore.
         * Background limits are described via the inactive limit slots.
         */

        if (proc_jetsam_state_is_active_locked(p) == TRUE) {
            CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception);
        } else {
            CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception);
        }

        /* Enforce the limit by writing to the ledgers */
        assert(trigger_exception == TRUE);
        error = (task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, trigger_exception) == 0) ? 0 : EINVAL;

        MEMORYSTATUS_DEBUG(3, "memorystatus_set_memlimit_properties: new limit on pid %d (%dMB %s) current priority (%d) dirty_state?=0x%x %s\n",
            p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
            (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
            (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
    }

    proc_list_unlock();
    proc_rele(p);

    return error;
}

/*
 * Returns the jetsam priority (effective or requested) of the process
 * associated with this task.
 */
int
proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
{
    if (p) {
        if (effective_priority) {
            return p->p_memstat_effectivepriority;
        } else {
            return p->p_memstat_requestedpriority;
        }
    }
    return 0;
}

/*
 * Description:
 *	Evaluates active vs. inactive process state.
 *	Processes that opt into dirty tracking are evaluated
 *	based on clean vs. dirty state.
 *	dirty ==> active
 *	clean ==> inactive
 *
 *	Processes that do not opt into dirty tracking are
 *	evaluated based on priority level.
 *	Foreground or above ==> active
 *	Below Foreground    ==> inactive
 *
 * Return: TRUE if active
 *	   FALSE if inactive
 */

static boolean_t
proc_jetsam_state_is_active_locked(proc_t p) {

    if (p->p_memstat_dirty & P_DIRTY_TRACK) {
        /*
         * process has opted into dirty tracking;
         * active state is based on dirty vs. clean
         */
        if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
            /*
             * process is dirty,
             * which implies active state
             */
            return TRUE;
        } else {
            /*
             * process is clean,
             * which implies inactive state
             */
            return FALSE;
        }
    } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
        /*
         * process is Foreground or higher,
         * which implies active state
         */
        return TRUE;
    } else {
        /*
         * process is below Foreground,
         * which implies inactive state
         */
        return FALSE;
    }
}

#endif /* CONFIG_JETSAM */

int
memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret) {
    int error = EINVAL;

#if !CONFIG_JETSAM
    #pragma unused(ret)
#endif

    /* Root only for now */
    if (!kauth_cred_issuser(kauth_cred_get())) {
        error = EPERM;
        goto out;
    }

    /* Sanity check */
    if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
        error = EINVAL;
        goto out;
    }

    switch (args->command) {
    case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
        error = memorystatus_cmd_get_priority_list(args->buffer, args->buffersize, ret);
        break;
#if CONFIG_JETSAM
    case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
        error = memorystatus_cmd_set_priority_properties(args->pid, args->buffer, args->buffersize, ret);
        break;
    case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
        error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
        break;
    case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES:
        error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
        break;
    case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
        error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
        break;
    case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
        error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
        break;
    case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
        error = memorystatus_cmd_get_pressure_status(ret);
        break;
    case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
        /*
         * This call does not distinguish between active and inactive limits.
         * Default behavior in 2-level HWM world is to set both.
         * Non-fatal limit is also assumed for both.
         */
        error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
        break;
    case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
        /*
         * This call does not distinguish between active and inactive limits.
         * Default behavior in 2-level HWM world is to set both.
         * Fatal limit is also assumed for both.
         */
        error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
        break;
    /* Test commands */
#if DEVELOPMENT || DEBUG
    case MEMORYSTATUS_CMD_TEST_JETSAM:
        error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled) ? 0 : EINVAL;
        break;
    case MEMORYSTATUS_CMD_TEST_JETSAM_SORT:
        error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags);
        break;
    case MEMORYSTATUS_CMD_SET_JETSAM_PANIC_BITS:
        error = memorystatus_cmd_set_panic_bits(args->buffer, args->buffersize);
        break;
#endif /* DEVELOPMENT || DEBUG */
#endif /* CONFIG_JETSAM */
    case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
    case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
        error = memorystatus_low_mem_privileged_listener(args->command);
        break;
    default:
        break;
    }

out:
    return error;
}

static int
filt_memorystatusattach(struct knote *kn)
{
    kn->kn_flags |= EV_CLEAR;
    return memorystatus_knote_register(kn);
}

static void
filt_memorystatusdetach(struct knote *kn)
{
    memorystatus_knote_unregister(kn);
}

static int
filt_memorystatus(struct knote *kn, long hint)
{
    if (hint) {
        switch (hint) {
        case kMemorystatusNoPressure:
            if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
                kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
            }
            break;
        case kMemorystatusPressure:
            if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
                if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
                    kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
                }
            } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
                if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
                    kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
                }
            }
            break;
        case kMemorystatusLowSwap:
            if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
                kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
            }
            break;
        default:
            break;
        }
    }

    return (kn->kn_fflags != 0);
}
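
/*
 * Illustrative sketch (not part of the kernel build): how a user-space
 * client would arm these filters via kqueue(2) using the private
 * EVFILT_MEMORYSTATUS filter. EV_CLEAR is applied by the attach routine
 * above, so the event re-fires on each level transition. The helper name
 * is hypothetical.
 */
#if 0
#include <sys/event.h>

static void
watch_memory_pressure(void)
{
    int kq = kqueue();
    struct kevent ev;

    EV_SET(&ev, 0, EVFILT_MEMORYSTATUS, EV_ADD,
        NOTE_MEMORYSTATUS_PRESSURE_NORMAL | NOTE_MEMORYSTATUS_PRESSURE_WARN |
        NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, 0, NULL);
    kevent(kq, &ev, 1, NULL, 0, NULL);    /* register */

    for (;;) {
        struct kevent out;
        if (kevent(kq, NULL, 0, &out, 1, NULL) == 1) {
            /* out.fflags holds the pressure-level note that fired */
        }
    }
}
#endif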

static void
memorystatus_klist_lock(void) {
    lck_mtx_lock(&memorystatus_klist_mutex);
}

static void
memorystatus_klist_unlock(void) {
    lck_mtx_unlock(&memorystatus_klist_mutex);
}

void
memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr) {
    lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
    klist_init(&memorystatus_klist);
}

int
memorystatus_knote_register(struct knote *kn) {
    int error = 0;

    memorystatus_klist_lock();

    if (kn->kn_sfflags & (NOTE_MEMORYSTATUS_PRESSURE_NORMAL | NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL | NOTE_MEMORYSTATUS_LOW_SWAP)) {
        KNOTE_ATTACH(&memorystatus_klist, kn);
    } else {
        error = ENOTSUP;
    }

    memorystatus_klist_unlock();

    return error;
}

void
memorystatus_knote_unregister(struct knote *kn) {
    memorystatus_klist_lock();
    KNOTE_DETACH(&memorystatus_klist, kn);
    memorystatus_klist_unlock();
}

#if 0
#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
static boolean_t
memorystatus_issue_pressure_kevent(boolean_t pressured) {
    memorystatus_klist_lock();
    KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
    memorystatus_klist_unlock();
    return TRUE;
}
#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
#endif /* 0 */

#if CONFIG_JETSAM
/* Coalition support */

/* sorting info for a particular priority bucket */
typedef struct memstat_sort_info {
    coalition_t msi_coal;
    uint64_t msi_page_count;
    pid_t msi_pid;
    int msi_ntasks;
} memstat_sort_info_t;

/*
 * qsort comparator: smallest page count to largest page count
 *
 * return < 0 for a < b
 *          0 for a == b
 *        > 0 for a > b
 */
static int memstat_asc_cmp(const void *a, const void *b)
{
    const memstat_sort_info_t *msA = (const memstat_sort_info_t *)a;
    const memstat_sort_info_t *msB = (const memstat_sort_info_t *)b;

    /*
     * Compare explicitly rather than truncating the unsigned 64-bit
     * difference to int, which can flip the sign for large counts.
     */
    if (msA->msi_page_count < msB->msi_page_count) {
        return -1;
    } else if (msA->msi_page_count > msB->msi_page_count) {
        return 1;
    }
    return 0;
}

/*
 * Return the number of pids rearranged during this sort.
 */
static int
memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order)
{
#define MAX_SORT_PIDS    80
#define MAX_COAL_LEADERS 10

    unsigned int b = bucket_index;
    int nleaders = 0;
    int ntasks = 0;
    proc_t p = NULL;
    coalition_t coal = COALITION_NULL;
    int pids_moved = 0;
    int total_pids_moved = 0;
    int i;

    /*
     * The system is typically under memory pressure when in this
     * path; hence, we want to avoid dynamic memory allocation.
     */
    memstat_sort_info_t leaders[MAX_COAL_LEADERS];
    pid_t pid_list[MAX_SORT_PIDS];

    if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
        return(0);
    }

    /*
     * Clear the array that holds coalition leader information
     */
    for (i = 0; i < MAX_COAL_LEADERS; i++) {
        leaders[i].msi_coal = COALITION_NULL;
        leaders[i].msi_page_count = 0;    /* will hold total coalition page count */
        leaders[i].msi_pid = 0;           /* will hold coalition leader pid */
        leaders[i].msi_ntasks = 0;        /* will hold the number of tasks in a coalition */
    }

    p = memorystatus_get_first_proc_locked(&b, FALSE);
    while (p) {
        if (coalition_is_leader(p->task, COALITION_TYPE_JETSAM, &coal)) {
            if (nleaders < MAX_COAL_LEADERS) {
                int coal_ntasks = 0;
                uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks);
                leaders[nleaders].msi_coal = coal;
                leaders[nleaders].msi_page_count = coal_page_count;
                leaders[nleaders].msi_pid = p->p_pid;    /* the coalition leader */
                leaders[nleaders].msi_ntasks = coal_ntasks;
                nleaders++;
            } else {
                /*
                 * We've hit MAX_COAL_LEADERS, meaning we can handle no more coalitions.
                 * Abandoned coalitions will linger at the tail of the priority band
                 * when this sort session ends.
                 * TODO: should this be an assert?
                 */
                printf("%s: WARNING: more than %d leaders in priority band [%d]\n",
                    __FUNCTION__, MAX_COAL_LEADERS, bucket_index);
                break;
            }
        }
        p = memorystatus_get_next_proc_locked(&b, p, FALSE);
    }

    if (nleaders == 0) {
        /* Nothing to sort */
        return(0);
    }

    /*
     * Sort the coalition leader array from smallest coalition page count
     * to largest coalition page count. When reinserted into the priority
     * bucket, the smallest coalition is handled first and therefore ends
     * up jetsammed last.
     */
    if (nleaders > 1) {
        qsort(leaders, nleaders, sizeof(memstat_sort_info_t), memstat_asc_cmp);
    }

#if 0
    for (i = 0; i < nleaders; i++) {
        printf("%s: coal_leader[%d of %d] pid[%d] pages[%llu] ntasks[%d]\n",
            __FUNCTION__, i, nleaders, leaders[i].msi_pid, leaders[i].msi_page_count,
            leaders[i].msi_ntasks);
    }
#endif

    /*
     * During coalition sorting, processes in a priority band are rearranged
     * by being re-inserted at the head of the queue. So, when handling a
     * list, the first process that gets moved to the head of the queue
     * ultimately gets pushed toward the queue tail, and hence, jetsams last.
     *
     * So, for example, the coalition leader is expected to jetsam last,
     * after its coalition members. Therefore, the coalition leader is
     * inserted at the head of the queue first.
     *
     * After processing a coalition, the jetsam order is as follows:
     *	undefs (jetsam first), extensions, xpc services, leader (jetsam last)
     */

    /*
     * Coalition members are rearranged in the priority bucket here,
     * based on their coalition role.
     */
    total_pids_moved = 0;
    for (i = 0; i < nleaders; i++) {

        /* a bit of bookkeeping */
        pids_moved = 0;

        /* Coalition leaders are jetsammed last, so move into place first */
        pid_list[0] = leaders[i].msi_pid;
        pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, 1);

        /* xpc services should jetsam after extensions */
        ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_XPC,
            coal_sort_order, pid_list, MAX_SORT_PIDS);

        if (ntasks > 0) {
            pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
                (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
        }

        /* extensions should jetsam after unmarked processes */
        ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_EXT,
            coal_sort_order, pid_list, MAX_SORT_PIDS);

        if (ntasks > 0) {
            pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
                (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
        }

        /* undefined coalition members should be the first to jetsam */
        ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_UNDEF,
            coal_sort_order, pid_list, MAX_SORT_PIDS);

        if (ntasks > 0) {
            pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
                (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
        }

#if 0
        if (pids_moved == leaders[i].msi_ntasks) {
            /*
             * All the pids in the coalition were found in this band.
             */
            printf("%s: pids_moved[%d] equal total coalition ntasks[%d]\n", __FUNCTION__,
                pids_moved, leaders[i].msi_ntasks);
        } else if (pids_moved > leaders[i].msi_ntasks) {
            /*
             * Apparently new coalition members showed up during the sort?
             */
            printf("%s: pids_moved[%d] were greater than expected coalition ntasks[%d]\n", __FUNCTION__,
                pids_moved, leaders[i].msi_ntasks);
        } else {
            /*
             * Apparently not all the pids in the coalition were found in this band?
             */
            printf("%s: pids_moved[%d] were less than expected coalition ntasks[%d]\n", __FUNCTION__,
                pids_moved, leaders[i].msi_ntasks);
        }
#endif

        total_pids_moved += pids_moved;

    } /* end for */

    return(total_pids_moved);
}


/*
 * Traverse a list of pids, searching for each within the priority band provided.
 * If a pid is found, move it to the front of the priority band.
 * Never searches outside the priority band provided.
 *
 * Input:
 *	bucket_index - jetsam priority band.
 *	pid_list - pointer to a list of pids.
 *	list_sz - number of pids in the list.
 *
 * Pid list ordering is important in that
 * pid_list[n] is expected to jetsam ahead of pid_list[n+1].
 * The sort_order is set by the coalition default.
 *
 * Return:
 *	the number of pids found and hence moved within the priority band.
 */
static int
memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz)
{
    memstat_bucket_t *current_bucket;
    int i;
    int found_pids = 0;

    if ((pid_list == NULL) || (list_sz <= 0)) {
        return(0);
    }

    if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
        return(0);
    }

    current_bucket = &memstat_bucket[bucket_index];
    for (i = 0; i < list_sz; i++) {
        unsigned int b = bucket_index;
        proc_t p = NULL;
        proc_t aProc = NULL;
        pid_t aPid;
        int list_index;

        list_index = ((list_sz - 1) - i);
        aPid = pid_list[list_index];

        /* never search beyond bucket_index provided */
        p = memorystatus_get_first_proc_locked(&b, FALSE);
        while (p) {
            if (p->p_pid == aPid) {
                aProc = p;
                break;
            }
            p = memorystatus_get_next_proc_locked(&b, p, FALSE);
        }

        if (aProc == NULL) {
            /* pid not found in this band, just skip it */
            continue;
        } else {
            TAILQ_REMOVE(&current_bucket->list, aProc, p_memstat_list);
            TAILQ_INSERT_HEAD(&current_bucket->list, aProc, p_memstat_list);
            found_pids++;
        }
    }
    return(found_pids);
}
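
/*
 * Worked example for the traversal above (hypothetical pids): given
 * pid_list = {10, 20, 30}, the loop walks the list from the tail, so 30 is
 * moved to the band head first, then 20, then 10. The final head-to-tail
 * order is 10, 20, 30 -- matching the contract that pid_list[n] jetsams
 * ahead of pid_list[n+1], since jetsam consumes the band from the head.
 */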
#endif /* CONFIG_JETSAM */