]> git.saurik.com Git - apple/xnu.git/blame - bsd/kern/kern_memorystatus.c
xnu-2782.20.48.tar.gz
[apple/xnu.git] / bsd / kern / kern_memorystatus.c
CommitLineData
2d21ac55
A
1/*
2 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
2d21ac55 29
2d21ac55 30#include <kern/sched_prim.h>
6d2010ae 31#include <kern/kalloc.h>
316670eb 32#include <kern/assert.h>
6d2010ae 33#include <kern/debug.h>
fe8ab488 34#include <kern/locks.h>
2d21ac55
A
35#include <kern/task.h>
36#include <kern/thread.h>
316670eb 37#include <kern/host.h>
2d21ac55 38#include <libkern/libkern.h>
316670eb 39#include <mach/mach_time.h>
b0d623f7 40#include <mach/task.h>
316670eb 41#include <mach/host_priv.h>
39236c6e
A
42#include <mach/mach_host.h>
43#include <pexpert/pexpert.h>
316670eb 44#include <sys/kern_event.h>
b0d623f7 45#include <sys/proc.h>
39236c6e 46#include <sys/proc_info.h>
b0d623f7
A
47#include <sys/signal.h>
48#include <sys/signalvar.h>
2d21ac55 49#include <sys/sysctl.h>
316670eb 50#include <sys/sysproto.h>
b0d623f7 51#include <sys/wait.h>
6d2010ae 52#include <sys/tree.h>
316670eb 53#include <sys/priv.h>
39236c6e
A
54#include <vm/vm_pageout.h>
55#include <vm/vm_protos.h>
6d2010ae
A
56
57#if CONFIG_FREEZE
6d2010ae 58#include <vm/vm_map.h>
39236c6e 59#endif /* CONFIG_FREEZE */
6d2010ae 60
316670eb 61#include <sys/kern_memorystatus.h>
6d2010ae 62
fe8ab488
A
63#if CONFIG_JETSAM
/*
 * Human-readable names for jetsam kill causes, for logging clarity.
 * Each entry is annotated with the kill-cause constant it describes;
 * the first entry is an empty placeholder.
 */
static const char *jetsam_kill_cause_name[] = {
	"",
	"jettisoned",		/* kMemorystatusKilled */
	"highwater",		/* kMemorystatusKilledHiwat */
	"vnode-limit",		/* kMemorystatusKilledVnodes */
	"vm-pageshortage",	/* kMemorystatusKilledVMPageShortage */
	"vm-thrashing",		/* kMemorystatusKilledVMThrashing */
	"fc-thrashing",		/* kMemorystatusKilledFCThrashing */
	"per-process-limit",	/* kMemorystatusKilledPerProcessLimit */
	"diagnostic",		/* kMemorystatusKilledDiagnostic */
	"idle-exit",		/* kMemorystatusKilledIdleExit */
};
77
78/* Does cause indicate vm or fc thrashing? */
79static boolean_t
80is_thrashing(unsigned cause)
81{
82 switch (cause) {
83 case kMemorystatusKilledVMThrashing:
84 case kMemorystatusKilledFCThrashing:
85 return TRUE;
86 default:
87 return FALSE;
88 }
89}
90
91/* Callback into vm_compressor.c to signal that thrashing has been mitigated. */
92extern void vm_thrashing_jetsam_done(void);
93#endif
94
316670eb
A
/*
 * Very verbose printf()-based logging.
 *
 * Enabled by building with MEMORYSTATUS_DEBUG_LOG set; otherwise every
 * use of MEMORYSTATUS_DEBUG() expands to nothing.
 */
#if MEMORYSTATUS_DEBUG_LOG
#define MEMORYSTATUS_DEBUG(cond, format, ...)		\
	do {						\
		if (cond) {				\
			printf(format, ##__VA_ARGS__);	\
		}					\
	} while(0)
#else
#define MEMORYSTATUS_DEBUG(cond, format, ...)
#endif
6d2010ae 106
39236c6e
A
107/* General tunables */
108
109unsigned long delta_percentage = 5;
110unsigned long critical_threshold_percentage = 5;
111unsigned long idle_offset_percentage = 5;
112unsigned long pressure_threshold_percentage = 15;
113unsigned long freeze_threshold_percentage = 50;
114
316670eb 115/* General memorystatus stuff */
6d2010ae 116
39236c6e
A
117struct klist memorystatus_klist;
118static lck_mtx_t memorystatus_klist_mutex;
6d2010ae 119
39236c6e
A
120static void memorystatus_klist_lock(void);
121static void memorystatus_klist_unlock(void);
6d2010ae 122
39236c6e
A
123static uint64_t memorystatus_idle_delay_time = 0;
124
125/*
126 * Memorystatus kevents
127 */
128
129static int filt_memorystatusattach(struct knote *kn);
130static void filt_memorystatusdetach(struct knote *kn);
131static int filt_memorystatus(struct knote *kn, long hint);
132
133struct filterops memorystatus_filtops = {
134 .f_attach = filt_memorystatusattach,
135 .f_detach = filt_memorystatusdetach,
136 .f_event = filt_memorystatus,
137};
138
139enum {
fe8ab488
A
140 kMemorystatusNoPressure = 0x1,
141 kMemorystatusPressure = 0x2,
142 kMemorystatusLowSwap = 0x4
39236c6e
A
143};
144
145/* Idle guard handling */
146
147static int32_t memorystatus_scheduled_idle_demotions = 0;
148
149static thread_call_t memorystatus_idle_demotion_call;
150
151static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
152static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state);
153static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state);
154static void memorystatus_reschedule_idle_demotion_locked(void);
6d2010ae 155
fe8ab488
A
156static void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert);
157
158boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
159void memorystatus_send_low_swap_note(void);
39236c6e
A
160
161int memorystatus_wakeup = 0;
162
163unsigned int memorystatus_level = 0;
6d2010ae 164
316670eb 165static int memorystatus_list_count = 0;
6d2010ae 166
39236c6e 167#define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1)
6d2010ae 168
39236c6e
A
169typedef struct memstat_bucket {
170 TAILQ_HEAD(, proc) list;
171 int count;
172} memstat_bucket_t;
6d2010ae 173
39236c6e
A
174memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
175
176uint64_t memstat_idle_demotion_deadline = 0;
6d2010ae 177
316670eb 178static unsigned int memorystatus_dirty_count = 0;
6d2010ae 179
39236c6e
A
180
181int
182memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
183{
184 user_addr_t level = 0;
185
186 level = args->level;
187
188 if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
189 return EFAULT;
190 }
191
192 return 0;
193}
194
195static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search);
196static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search);
197
198static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);
6d2010ae 199
316670eb
A
200/* Jetsam */
201
202#if CONFIG_JETSAM
203
fe8ab488
A
204int proc_get_memstat_priority(proc_t, boolean_t);
205
39236c6e
A
206/* Kill processes exceeding their limit either under memory pressure (1), or as soon as possible (0) */
207#define LEGACY_HIWATER 1
208
fe8ab488 209static boolean_t memorystatus_idle_snapshot = 0;
39236c6e 210
fe8ab488 211static int memorystatus_highwater_enabled = 1;
316670eb 212
316670eb
A
213unsigned int memorystatus_delta = 0;
214
39236c6e 215static unsigned int memorystatus_available_pages_critical_base = 0;
fe8ab488 216//static unsigned int memorystatus_last_foreground_pressure_pages = (unsigned int)-1;
39236c6e 217static unsigned int memorystatus_available_pages_critical_idle_offset = 0;
316670eb 218
39236c6e
A
219#if DEVELOPMENT || DEBUG
220static unsigned int memorystatus_jetsam_panic_debug = 0;
316670eb 221
39236c6e
A
222static unsigned int memorystatus_jetsam_policy = kPolicyDefault;
223static unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0;
224#endif
316670eb 225
fe8ab488
A
226static unsigned int memorystatus_thread_wasted_wakeup = 0;
227
228static uint32_t kill_under_pressure_cause = 0;
316670eb 229
39236c6e
A
230static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
231#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries
316670eb 232
39236c6e
A
233static unsigned int memorystatus_jetsam_snapshot_count = 0;
234static unsigned int memorystatus_jetsam_snapshot_max = 0;
316670eb 235
39236c6e 236static void memorystatus_clear_errors(void);
fe8ab488 237static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages);
39236c6e
A
238static uint32_t memorystatus_build_state(proc_t p);
239static void memorystatus_update_levels_locked(boolean_t critical_only);
fe8ab488 240//static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);
39236c6e
A
241
242static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause);
243static boolean_t memorystatus_kill_top_process(boolean_t any, uint32_t cause, int32_t *priority, uint32_t *errors);
244#if LEGACY_HIWATER
245static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors);
246#endif
247
248static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);
249static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause);
316670eb 250
39236c6e 251#endif /* CONFIG_JETSAM */
6d2010ae 252
316670eb 253/* VM pressure */
6d2010ae 254
fe8ab488
A
255extern unsigned int vm_page_free_count;
256extern unsigned int vm_page_active_count;
257extern unsigned int vm_page_inactive_count;
258extern unsigned int vm_page_throttled_count;
259extern unsigned int vm_page_purgeable_count;
260extern unsigned int vm_page_wire_count;
261
316670eb 262#if VM_PRESSURE_EVENTS
6d2010ae 263
39236c6e 264#include "vm_pressure.h"
6d2010ae 265
fe8ab488 266extern boolean_t memorystatus_warn_process(pid_t pid, boolean_t critical);
316670eb 267
39236c6e 268vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;
316670eb 269
fe8ab488
A
270#if CONFIG_MEMORYSTATUS
271unsigned int memorystatus_available_pages = (unsigned int)-1;
272unsigned int memorystatus_available_pages_pressure = 0;
273unsigned int memorystatus_available_pages_critical = 0;
274unsigned int memorystatus_frozen_count = 0;
275unsigned int memorystatus_suspended_count = 0;
276
277/*
278 * We use this flag to signal if we have any HWM offenders
279 * on the system. This way we can reduce the number of wakeups
280 * of the memorystatus_thread when the system is between the
281 * "pressure" and "critical" threshold.
282 *
283 * The (re-)setting of this variable is done without any locks
284 * or synchronization simply because it is not possible (currently)
285 * to keep track of HWM offenders that drop down below their memory
286 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
287 * by allowing the unguarded modification of this variable.
288 */
289boolean_t memorystatus_hwm_candidates = 0;
290
291static int memorystatus_send_note(int event_code, void *data, size_t data_length);
292#endif /* CONFIG_MEMORYSTATUS */
293
316670eb
A
294#endif /* VM_PRESSURE_EVENTS */
295
316670eb
A
296/* Freeze */
297
298#if CONFIG_FREEZE
299
316670eb
A
300boolean_t memorystatus_freeze_enabled = FALSE;
301int memorystatus_freeze_wakeup = 0;
302
303static inline boolean_t memorystatus_can_freeze_processes(void);
304static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low);
305
306static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused);
307
308/* Thresholds */
309static unsigned int memorystatus_freeze_threshold = 0;
310
fe8ab488
A
311static unsigned int memorystatus_freeze_pages_min = 0;
312static unsigned int memorystatus_freeze_pages_max = 0;
316670eb
A
313
314static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
315
316/* Stats */
317static uint64_t memorystatus_freeze_count = 0;
318static uint64_t memorystatus_freeze_pageouts = 0;
6d2010ae
A
319
320/* Throttling */
316670eb
A
321static throttle_interval_t throttle_intervals[] = {
322 { 60, 8, 0, 0, { 0, 0 }, FALSE }, /* 1 hour intermediate interval, 8x burst */
6d2010ae
A
323 { 24 * 60, 1, 0, 0, { 0, 0 }, FALSE }, /* 24 hour long interval, no burst */
324};
325
316670eb 326static uint64_t memorystatus_freeze_throttle_count = 0;
6d2010ae 327
39236c6e 328static unsigned int memorystatus_suspended_footprint_total = 0;
6d2010ae 329
39236c6e 330#endif /* CONFIG_FREEZE */
6d2010ae 331
316670eb 332/* Debug */
6d2010ae 333
fe8ab488
A
334extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);
335
6d2010ae 336#if DEVELOPMENT || DEBUG
6d2010ae 337
39236c6e
A
338#if CONFIG_JETSAM
339
340/* Debug aid to aid determination of limit */
341
342static int
343sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
344{
345#pragma unused(oidp, arg2)
346 proc_t p;
347 unsigned int b = 0;
348 int error, enable = 0;
349 int32_t memlimit;
350
351 error = SYSCTL_OUT(req, arg1, sizeof(int));
352 if (error || !req->newptr) {
353 return (error);
354 }
355
356 error = SYSCTL_IN(req, &enable, sizeof(int));
357 if (error || !req->newptr) {
358 return (error);
359 }
360
361 if (!(enable == 0 || enable == 1)) {
362 return EINVAL;
363 }
364
365 proc_list_lock();
366
367 p = memorystatus_get_first_proc_locked(&b, TRUE);
368 while (p) {
369 if (enable) {
370 if ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
371 memlimit = -1;
372 } else {
373 memlimit = p->p_memstat_memlimit;
374 }
375 } else {
376 memlimit = -1;
377 }
378 task_set_phys_footprint_limit_internal(p->task, (memlimit > 0) ? memlimit : -1, NULL, TRUE);
379
fe8ab488
A
380 if (memlimit == -1) {
381 p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
382 } else {
383 if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) {
384 p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
385 }
386 }
387
39236c6e
A
388 p = memorystatus_get_next_proc_locked(&b, p, TRUE);
389 }
390
391 memorystatus_highwater_enabled = enable;
392
393 proc_list_unlock();
394
395 return 0;
396}
397
fe8ab488
A
398SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");
399
39236c6e
A
400SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");
401
402SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
403SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
404SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
39236c6e 405SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");
316670eb
A
406
407/* Diagnostic code */
39236c6e 408
316670eb
A
409enum {
410 kJetsamDiagnosticModeNone = 0,
411 kJetsamDiagnosticModeAll = 1,
412 kJetsamDiagnosticModeStopAtFirstActive = 2,
413 kJetsamDiagnosticModeCount
414} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone;
415
416static int jetsam_diagnostic_suspended_one_active_proc = 0;
417
418static int
419sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS
420{
421#pragma unused(arg1, arg2)
422
423 const char *diagnosticStrings[] = {
424 "jetsam: diagnostic mode: resetting critical level.",
425 "jetsam: diagnostic mode: will examine all processes",
426 "jetsam: diagnostic mode: will stop at first active process"
427 };
428
429 int error, val = jetsam_diagnostic_mode;
430 boolean_t changed = FALSE;
431
432 error = sysctl_handle_int(oidp, &val, 0, req);
433 if (error || !req->newptr)
434 return (error);
435 if ((val < 0) || (val >= kJetsamDiagnosticModeCount)) {
436 printf("jetsam: diagnostic mode: invalid value - %d\n", val);
437 return EINVAL;
438 }
439
39236c6e 440 proc_list_lock();
316670eb
A
441
442 if ((unsigned int) val != jetsam_diagnostic_mode) {
443 jetsam_diagnostic_mode = val;
444
445 memorystatus_jetsam_policy &= ~kPolicyDiagnoseActive;
446
447 switch (jetsam_diagnostic_mode) {
448 case kJetsamDiagnosticModeNone:
449 /* Already cleared */
450 break;
451 case kJetsamDiagnosticModeAll:
452 memorystatus_jetsam_policy |= kPolicyDiagnoseAll;
453 break;
454 case kJetsamDiagnosticModeStopAtFirstActive:
455 memorystatus_jetsam_policy |= kPolicyDiagnoseFirst;
456 break;
457 default:
458 /* Already validated */
459 break;
460 }
461
39236c6e 462 memorystatus_update_levels_locked(FALSE);
316670eb
A
463 changed = TRUE;
464 }
465
39236c6e 466 proc_list_unlock();
316670eb
A
467
468 if (changed) {
469 printf("%s\n", diagnosticStrings[val]);
470 }
471
472 return (0);
473}
474
39236c6e 475SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED|CTLFLAG_ANYBODY,
316670eb
A
476 &jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode");
477
39236c6e 478SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jetsam_policy_offset_pages_diagnostic, 0, "");
316670eb
A
479
480#if VM_PRESSURE_EVENTS
481
39236c6e 482SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");
316670eb 483
316670eb 484
fe8ab488
A
485/*
486 * This routine is used for targeted notifications
487 * regardless of system memory pressure.
488 * "memnote" is the current user.
489 */
316670eb
A
490
491static int
492sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
493{
494#pragma unused(arg1, arg2)
495
fe8ab488
A
496 int error = 0, pid = 0;
497 int ret = 0;
498 struct knote *kn = NULL;
316670eb
A
499
500 error = sysctl_handle_int(oidp, &pid, 0, req);
501 if (error || !req->newptr)
502 return (error);
503
fe8ab488
A
504 /*
505 * We inspect 3 lists here for targeted notifications:
506 * - memorystatus_klist
507 * - vm_pressure_klist
508 * - vm_pressure_dormant_klist
509 *
510 * The vm_pressure_* lists are tied to the old VM_PRESSURE
511 * notification mechanism. We intend to stop using that
512 * mechanism and, in turn, get rid of the 2 lists and
513 * vm_dispatch_pressure_note_to_pid() too.
514 */
515
516 memorystatus_klist_lock();
517 kn = vm_find_knote_from_pid(pid, &memorystatus_klist);
518 if (kn) {
519 /*
520 * Forcibly send this pid a "warning" memory pressure notification.
521 */
522 kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_WARN;
523 KNOTE(&memorystatus_klist, kMemorystatusPressure);
524 ret = 0;
525 } else {
526 ret = vm_dispatch_pressure_note_to_pid(pid, FALSE);
527 }
528 memorystatus_klist_unlock();
529
530 return ret;
316670eb
A
531}
532
533SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
534 0, 0, &sysctl_memorystatus_vm_pressure_send, "I", "");
535
536#endif /* VM_PRESSURE_EVENTS */
537
538#endif /* CONFIG_JETSAM */
539
540#if CONFIG_FREEZE
541
39236c6e 542SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");
316670eb 543
39236c6e
A
544SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
545SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");
316670eb 546
39236c6e
A
547SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, "");
548SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
549SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, "");
550SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");
316670eb
A
551
552boolean_t memorystatus_freeze_throttle_enabled = TRUE;
39236c6e 553SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");
316670eb
A
554
555/*
fe8ab488 556 * Manual trigger of freeze and thaw for dev / debug kernels only.
316670eb
A
557 */
558static int
559sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
560{
561#pragma unused(arg1, arg2)
562
563 int error, pid = 0;
564 proc_t p;
565
fe8ab488
A
566 if (memorystatus_freeze_enabled == FALSE) {
567 return ENOTSUP;
568 }
569
316670eb
A
570 error = sysctl_handle_int(oidp, &pid, 0, req);
571 if (error || !req->newptr)
572 return (error);
573
574 p = proc_find(pid);
575 if (p != NULL) {
576 uint32_t purgeable, wired, clean, dirty;
577 boolean_t shared;
39236c6e
A
578 uint32_t max_pages = 0;
579
fe8ab488 580 if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
39236c6e
A
581 max_pages = MIN(default_pager_swap_pages_free(), memorystatus_freeze_pages_max);
582 } else {
583 max_pages = UINT32_MAX - 1;
584 }
585 error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
316670eb 586 proc_rele(p);
316670eb 587
39236c6e
A
588 if (error)
589 error = EIO;
590 return error;
591 }
316670eb
A
592 return EINVAL;
593}
594
595SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
596 0, 0, &sysctl_memorystatus_freeze, "I", "");
597
598static int
599sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
600{
601#pragma unused(arg1, arg2)
602
603 int error, pid = 0;
604 proc_t p;
605
fe8ab488
A
606 if (memorystatus_freeze_enabled == FALSE) {
607 return ENOTSUP;
608 }
609
316670eb
A
610 error = sysctl_handle_int(oidp, &pid, 0, req);
611 if (error || !req->newptr)
612 return (error);
613
614 p = proc_find(pid);
615 if (p != NULL) {
39236c6e 616 error = task_thaw(p->task);
316670eb 617 proc_rele(p);
39236c6e
A
618
619 if (error)
620 error = EIO;
621 return error;
316670eb
A
622 }
623
624 return EINVAL;
625}
626
627SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
628 0, 0, &sysctl_memorystatus_available_pages_thaw, "I", "");
6d2010ae 629
6d2010ae 630#endif /* CONFIG_FREEZE */
2d21ac55 631
fe8ab488
A
632#endif /* DEVELOPMENT || DEBUG */
633
39236c6e
A
634extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
635 void *parameter,
636 integer_t priority,
637 thread_t *new_thread);
638
fe8ab488
A
639#if CONFIG_JETSAM
640/*
641 * Sort processes by size for a single jetsam bucket.
642 */
643
644static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
645{
646 proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
647 uint32_t pages = 0, max_pages = 0;
648 memstat_bucket_t *current_bucket;
649
650 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
651 return;
652 }
653
654 current_bucket = &memstat_bucket[bucket_index];
655
656 p = TAILQ_FIRST(&current_bucket->list);
657
658 if (p) {
659 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
660 max_pages = pages;
661 insert_after_proc = NULL;
662
663 p = TAILQ_NEXT(p, p_memstat_list);
664
665restart:
666 while (p) {
667
668 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
669
670 if (pages > max_pages) {
671 max_pages = pages;
672 max_proc = p;
673 }
674
675 p = TAILQ_NEXT(p, p_memstat_list);
676 }
677
678 if (max_proc) {
679
680 TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
681
682 if (insert_after_proc == NULL) {
683 TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
684 } else {
685 TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
686 }
687
688 insert_after_proc = max_proc;
689
690 /* Reset parameters for the new search. */
691 p = TAILQ_NEXT(max_proc, p_memstat_list);
692 if (p) {
693 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
694 max_pages = pages;
695 }
696 max_proc = NULL;
697
698 goto restart;
699 }
700 }
701}
702
703#endif /* CONFIG_JETSAM */
704
39236c6e
A
705static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search) {
706 memstat_bucket_t *current_bucket;
707 proc_t next_p;
708
709 if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
710 return NULL;
711 }
712
713 current_bucket = &memstat_bucket[*bucket_index];
714 next_p = TAILQ_FIRST(&current_bucket->list);
715 if (!next_p && search) {
716 while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
717 current_bucket = &memstat_bucket[*bucket_index];
718 next_p = TAILQ_FIRST(&current_bucket->list);
719 }
720 }
721
722 return next_p;
723}
724
725static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search) {
726 memstat_bucket_t *current_bucket;
727 proc_t next_p;
728
729 if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
730 return NULL;
731 }
732
733 next_p = TAILQ_NEXT(p, p_memstat_list);
734 while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
735 current_bucket = &memstat_bucket[*bucket_index];
736 next_p = TAILQ_FIRST(&current_bucket->list);
737 }
738
739 return next_p;
740}
316670eb
A
741
742__private_extern__ void
743memorystatus_init(void)
744{
745 thread_t thread = THREAD_NULL;
746 kern_return_t result;
39236c6e
A
747 int i;
748
fe8ab488
A
749#if CONFIG_FREEZE
750 memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
751 memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
752#endif
753
39236c6e
A
754 nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_idle_delay_time);
755
756 /* Init buckets */
757 for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
758 TAILQ_INIT(&memstat_bucket[i].list);
759 memstat_bucket[i].count = 0;
760 }
316670eb 761
39236c6e 762 memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);
316670eb 763
39236c6e
A
764 /* Apply overrides */
765 PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
766 assert(delta_percentage < 100);
767 PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
768 assert(critical_threshold_percentage < 100);
769 PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
770 assert(idle_offset_percentage < 100);
771 PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
772 assert(pressure_threshold_percentage < 100);
773 PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
774 assert(freeze_threshold_percentage < 100);
316670eb 775
39236c6e
A
776#if CONFIG_JETSAM
777 memorystatus_delta = delta_percentage * atop_64(max_mem) / 100;
39236c6e 778 memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100;
39236c6e
A
779 memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta;
780
781 memorystatus_jetsam_snapshot_max = maxproc;
782 memorystatus_jetsam_snapshot =
783 (memorystatus_jetsam_snapshot_t*)kalloc(sizeof(memorystatus_jetsam_snapshot_t) +
784 sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
785 if (!memorystatus_jetsam_snapshot) {
786 panic("Could not allocate memorystatus_jetsam_snapshot");
787 }
788
789 /* No contention at this point */
790 memorystatus_update_levels_locked(FALSE);
791#endif
792
316670eb 793#if CONFIG_FREEZE
39236c6e 794 memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta;
316670eb 795#endif
39236c6e
A
796
797 result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
316670eb
A
798 if (result == KERN_SUCCESS) {
799 thread_deallocate(thread);
800 } else {
801 panic("Could not create memorystatus_thread");
802 }
39236c6e 803}
316670eb 804
39236c6e
A
805/* Centralised for the purposes of allowing panic-on-jetsam */
806extern void
807vm_wake_compactor_swapper(void);
316670eb 808
fe8ab488
A
809/*
810 * The jetsam no frills kill call
811 * Return: 0 on success
812 * error code on failure (EINVAL...)
813 */
814static int
815jetsam_do_kill(proc_t p, int jetsam_flags) {
816 int error = 0;
817 error = exit1_internal(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags);
818 return(error);
819}
820
821/*
822 * Wrapper for processes exiting with memorystatus details
823 */
39236c6e
A
824static boolean_t
825memorystatus_do_kill(proc_t p, uint32_t cause) {
826
fe8ab488
A
827 int error = 0;
828 __unused pid_t victim_pid = p->p_pid;
829
830 KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START,
831 victim_pid, cause, vm_page_free_count, 0, 0);
39236c6e
A
832
833#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
834 if (memorystatus_jetsam_panic_debug & (1 << cause)) {
835 panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause);
316670eb 836 }
39236c6e
A
837#else
838#pragma unused(cause)
316670eb 839#endif
39236c6e
A
840 int jetsam_flags = P_LTERM_JETSAM;
841 switch (cause) {
842 case kMemorystatusKilledHiwat: jetsam_flags |= P_JETSAM_HIWAT; break;
843 case kMemorystatusKilledVnodes: jetsam_flags |= P_JETSAM_VNODE; break;
844 case kMemorystatusKilledVMPageShortage: jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
845 case kMemorystatusKilledVMThrashing: jetsam_flags |= P_JETSAM_VMTHRASHING; break;
fe8ab488 846 case kMemorystatusKilledFCThrashing: jetsam_flags |= P_JETSAM_FCTHRASHING; break;
39236c6e
A
847 case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break;
848 case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break;
849 }
fe8ab488
A
850 error = jetsam_do_kill(p, jetsam_flags);
851
852 KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END,
853 victim_pid, cause, vm_page_free_count, error, 0);
39236c6e
A
854
855 if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
856 vm_wake_compactor_swapper();
857 }
fe8ab488
A
858
859 return (error == 0);
316670eb
A
860}
861
862/*
863 * Node manipulation
864 */
865
/*
 * Refresh the memorystatus levels via
 * memorystatus_update_levels_locked(TRUE). Does nothing on kernels built
 * without CONFIG_JETSAM.
 */
static void
memorystatus_check_levels_locked(void)
{
#if CONFIG_JETSAM
	memorystatus_update_levels_locked(TRUE);	/* Update levels */
#endif
}
316670eb 873
39236c6e
A
874static void
875memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
876{
877 proc_t p;
878 uint64_t current_time;
879 memstat_bucket_t *demotion_bucket;
880
881 MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");
882
883 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);
884
885 current_time = mach_absolute_time();
886
887 proc_list_lock();
316670eb 888
39236c6e
A
889 demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
890 p = TAILQ_FIRST(&demotion_bucket->list);
891
892 while (p) {
893 MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid);
894
895 assert(p->p_memstat_idledeadline);
896 assert(p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS);
897 assert((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED);
898
899 if (current_time >= p->p_memstat_idledeadline) {
900#if DEBUG || DEVELOPMENT
901 if (!(p->p_memstat_dirty & P_DIRTY_MARKED)) {
fe8ab488
A
902 printf("memorystatus_perform_idle_demotion: moving process %d [%s] to idle band, but never dirtied (0x%x)!\n",
903 p->p_pid, (p->p_comm ? p->p_comm : "(unknown)"), p->p_memstat_dirty);
39236c6e
A
904 }
905#endif
906 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
fe8ab488 907 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false);
39236c6e
A
908
909 // The prior process has moved out of the demotion bucket, so grab the new head and continue
910 p = TAILQ_FIRST(&demotion_bucket->list);
911 continue;
316670eb 912 }
39236c6e
A
913
914 // No further candidates
915 break;
316670eb 916 }
39236c6e
A
917
918 memorystatus_reschedule_idle_demotion_locked();
919
920 proc_list_unlock();
316670eb 921
39236c6e 922 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
316670eb
A
923}
924
925static void
39236c6e
A
926memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
927{
fe8ab488
A
928 boolean_t present_in_deferred_bucket = FALSE;
929
930 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
931 present_in_deferred_bucket = TRUE;
932 }
933
39236c6e
A
934 MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for process %d (dirty:0x%x, set_state %d, demotions %d).\n",
935 p->p_pid, p->p_memstat_dirty, set_state, memorystatus_scheduled_idle_demotions);
316670eb 936
fe8ab488 937 assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED);
316670eb 938
39236c6e
A
939 if (set_state) {
940 assert(p->p_memstat_idledeadline == 0);
fe8ab488 941 p->p_memstat_dirty |= P_DIRTY_DEFER_IN_PROGRESS;
39236c6e 942 p->p_memstat_idledeadline = mach_absolute_time() + memorystatus_idle_delay_time;
316670eb 943 }
39236c6e 944
fe8ab488 945 assert(p->p_memstat_idledeadline);
39236c6e 946
fe8ab488
A
947 if (present_in_deferred_bucket == FALSE) {
948 memorystatus_scheduled_idle_demotions++;
949 }
316670eb
A
950}
951
39236c6e
A
952static void
953memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
316670eb 954{
fe8ab488
A
955 boolean_t present_in_deferred_bucket = FALSE;
956
957 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
958 present_in_deferred_bucket = TRUE;
959 assert(p->p_memstat_idledeadline);
960 }
961
39236c6e
A
962 MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for process %d (clear_state %d, demotions %d).\n",
963 p->p_pid, clear_state, memorystatus_scheduled_idle_demotions);
964
39236c6e
A
965
966 if (clear_state) {
967 p->p_memstat_idledeadline = 0;
968 p->p_memstat_dirty &= ~P_DIRTY_DEFER_IN_PROGRESS;
316670eb 969 }
39236c6e 970
fe8ab488
A
971 if (present_in_deferred_bucket == TRUE) {
972 memorystatus_scheduled_idle_demotions--;
973 }
974
39236c6e 975 assert(memorystatus_scheduled_idle_demotions >= 0);
316670eb
A
976}
977
978static void
39236c6e
A
979memorystatus_reschedule_idle_demotion_locked(void) {
980 if (0 == memorystatus_scheduled_idle_demotions) {
981 if (memstat_idle_demotion_deadline) {
982 /* Transitioned 1->0, so cancel next call */
983 thread_call_cancel(memorystatus_idle_demotion_call);
984 memstat_idle_demotion_deadline = 0;
985 }
986 } else {
987 memstat_bucket_t *demotion_bucket;
988 proc_t p;
989 demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
990 p = TAILQ_FIRST(&demotion_bucket->list);
39236c6e 991
fe8ab488
A
992 assert(p && p->p_memstat_idledeadline);
993
994 if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline){
995 thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
996 memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
39236c6e
A
997 }
998 }
316670eb
A
999}
1000
1001/*
1002 * List manipulation
1003 */
1004
39236c6e
A
1005int
1006memorystatus_add(proc_t p, boolean_t locked)
316670eb 1007{
39236c6e
A
1008 memstat_bucket_t *bucket;
1009
fe8ab488 1010 MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding process %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority);
39236c6e
A
1011
1012 if (!locked) {
1013 proc_list_lock();
1014 }
1015
1016 /* Processes marked internal do not have priority tracked */
1017 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
1018 goto exit;
1019 }
1020
1021 bucket = &memstat_bucket[p->p_memstat_effectivepriority];
1022
fe8ab488
A
1023 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
1024 assert(bucket->count == memorystatus_scheduled_idle_demotions);
1025 }
1026
39236c6e
A
1027 TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
1028 bucket->count++;
316670eb 1029
39236c6e 1030 memorystatus_list_count++;
316670eb 1031
39236c6e
A
1032 memorystatus_check_levels_locked();
1033
1034exit:
1035 if (!locked) {
1036 proc_list_unlock();
1037 }
1038
1039 return 0;
1040}
316670eb 1041
39236c6e 1042static void
fe8ab488 1043memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert)
39236c6e
A
1044{
1045 memstat_bucket_t *old_bucket, *new_bucket;
1046
1047 assert(priority < MEMSTAT_BUCKET_COUNT);
1048
1049 /* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
1050 if ((p->p_listflag & P_LIST_EXITED) != 0) {
1051 return;
316670eb 1052 }
39236c6e 1053
fe8ab488
A
1054 MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting process %d to priority %d, inserting at %s\n",
1055 p->p_pid, priority, head_insert ? "head" : "tail");
316670eb 1056
39236c6e 1057 old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
fe8ab488
A
1058 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
1059 assert(old_bucket->count == (memorystatus_scheduled_idle_demotions + 1));
1060 }
1061
39236c6e
A
1062 TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
1063 old_bucket->count--;
316670eb 1064
39236c6e 1065 new_bucket = &memstat_bucket[priority];
fe8ab488
A
1066 if (head_insert)
1067 TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
1068 else
1069 TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
39236c6e
A
1070 new_bucket->count++;
1071
1072#if CONFIG_JETSAM
1073 if (memorystatus_highwater_enabled && (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND)) {
fe8ab488
A
1074
1075 /*
1076 * Adjust memory limit based on if the task is going to/from foreground and background.
1077 */
1078
39236c6e
A
1079 if (((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) ||
1080 ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND))) {
1081 int32_t memlimit = (priority >= JETSAM_PRIORITY_FOREGROUND) ? -1 : p->p_memstat_memlimit;
1082 task_set_phys_footprint_limit_internal(p->task, (memlimit > 0) ? memlimit : -1, NULL, TRUE);
fe8ab488
A
1083
1084 if (memlimit <= 0) {
1085 p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
1086 } else {
1087 p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
1088 }
39236c6e
A
1089 }
1090 }
1091#endif
1092
1093 p->p_memstat_effectivepriority = priority;
1094
1095 memorystatus_check_levels_locked();
316670eb
A
1096}
1097
39236c6e 1098int
fe8ab488 1099memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective, boolean_t update_memlimit, int32_t memlimit, boolean_t memlimit_background, boolean_t is_fatal_limit)
316670eb 1100{
39236c6e 1101 int ret;
fe8ab488 1102 boolean_t head_insert = false;
39236c6e 1103
316670eb 1104#if !CONFIG_JETSAM
fe8ab488 1105#pragma unused(update_memlimit, memlimit, memlimit_background, is_fatal_limit)
316670eb 1106#endif
316670eb 1107
39236c6e 1108 MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing process %d: priority %d, user_data 0x%llx\n", p->p_pid, priority, user_data);
316670eb 1109
39236c6e
A
1110 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0);
1111
1112 if (priority == -1) {
1113 /* Use as shorthand for default priority */
1114 priority = JETSAM_PRIORITY_DEFAULT;
1115 } else if (priority == JETSAM_PRIORITY_IDLE_DEFERRED) {
1116 /* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
1117 priority = JETSAM_PRIORITY_IDLE;
fe8ab488
A
1118 } else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
1119 /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
1120 priority = JETSAM_PRIORITY_IDLE;
1121 head_insert = true;
39236c6e
A
1122 } else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
1123 /* Sanity check */
1124 ret = EINVAL;
1125 goto out;
316670eb 1126 }
39236c6e
A
1127
1128 proc_list_lock();
1129
1130 assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
316670eb 1131
39236c6e
A
1132 if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
1133 ret = EALREADY;
1134 proc_list_unlock();
fe8ab488
A
1135 MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid);
1136 goto out;
1137 }
1138
1139 if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) {
1140 /*
1141 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
1142 */
1143 ret = EBUSY;
1144 proc_list_unlock();
316670eb
A
1145 goto out;
1146 }
1147
39236c6e
A
1148 p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
1149 p->p_memstat_userdata = user_data;
1150 p->p_memstat_requestedpriority = priority;
1151
1152#if CONFIG_JETSAM
1153 if (update_memlimit) {
1154 p->p_memstat_memlimit = memlimit;
1155 if (memlimit_background) {
1156 /* Will be set as priority is updated */
1157 p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_BACKGROUND;
fe8ab488
A
1158
1159 /* Cannot have a background memory limit and be fatal. */
1160 is_fatal_limit = FALSE;
1161
316670eb 1162 } else {
39236c6e
A
1163 /* Otherwise, apply now */
1164 if (memorystatus_highwater_enabled) {
1165 task_set_phys_footprint_limit_internal(p->task, (memlimit > 0) ? memlimit : -1, NULL, TRUE);
1166 }
316670eb 1167 }
fe8ab488
A
1168
1169 if (is_fatal_limit || memlimit <= 0) {
1170 p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
1171 } else {
1172 p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
1173 }
316670eb 1174 }
39236c6e 1175#endif
316670eb 1176
fe8ab488
A
1177 /*
1178 * We can't add to the JETSAM_PRIORITY_IDLE_DEFERRED bucket here.
1179 * But, we could be removing it from the bucket.
1180 * Check and take appropriate steps if so.
1181 */
1182
1183 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
1184
1185 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1186 }
1187
1188 memorystatus_update_priority_locked(p, priority, head_insert);
39236c6e
A
1189
1190 proc_list_unlock();
1191 ret = 0;
316670eb
A
1192
1193out:
39236c6e
A
1194 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);
1195
316670eb
A
1196 return ret;
1197}
1198
39236c6e
A
1199int
1200memorystatus_remove(proc_t p, boolean_t locked)
316670eb 1201{
39236c6e
A
1202 int ret;
1203 memstat_bucket_t *bucket;
316670eb 1204
fe8ab488 1205 MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing process %d\n", p->p_pid);
316670eb 1206
39236c6e
A
1207 if (!locked) {
1208 proc_list_lock();
1209 }
316670eb 1210
39236c6e 1211 assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
fe8ab488 1212
39236c6e 1213 bucket = &memstat_bucket[p->p_memstat_effectivepriority];
fe8ab488
A
1214 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
1215 assert(bucket->count == memorystatus_scheduled_idle_demotions);
1216 }
1217
39236c6e
A
1218 TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
1219 bucket->count--;
1220
1221 memorystatus_list_count--;
316670eb 1222
39236c6e
A
1223 /* If awaiting demotion to the idle band, clean up */
1224 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
1225 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1226 memorystatus_reschedule_idle_demotion_locked();
1227 }
316670eb 1228
39236c6e
A
1229 memorystatus_check_levels_locked();
1230
1231#if CONFIG_FREEZE
1232 if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
1233 memorystatus_frozen_count--;
1234 }
316670eb 1235
39236c6e
A
1236 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
1237 memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
1238 memorystatus_suspended_count--;
316670eb 1239 }
39236c6e
A
1240#endif
1241
1242 if (!locked) {
1243 proc_list_unlock();
1244 }
316670eb 1245
39236c6e
A
1246 if (p) {
1247 ret = 0;
316670eb 1248 } else {
39236c6e 1249 ret = ESRCH;
316670eb
A
1250 }
1251
1252 return ret;
1253}
1254
39236c6e
A
1255static boolean_t
1256memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) {
1257 /* See that the process isn't marked for termination */
1258 if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
1259 return FALSE;
316670eb
A
1260 }
1261
39236c6e
A
1262 /* Idle exit requires that process be tracked */
1263 if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
1264 !(pcontrol & PROC_DIRTY_TRACK)) {
1265 return FALSE;
1266 }
1267
fe8ab488
A
1268 /* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
1269 if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
1270 !(pcontrol & PROC_DIRTY_TRACK)) {
1271 return FALSE;
1272 }
1273
39236c6e
A
1274 /* Deferral is only relevant if idle exit is specified */
1275 if ((pcontrol & PROC_DIRTY_DEFER) &&
1276 !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
1277 return FALSE;
316670eb
A
1278 }
1279
39236c6e 1280 return TRUE;
316670eb 1281}
593a1d5f 1282
39236c6e
A
1283static void
1284memorystatus_update_idle_priority_locked(proc_t p) {
1285 int32_t priority;
1286
1287 MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty);
1288
1289 if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
1290 priority = (p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS) ? JETSAM_PRIORITY_IDLE_DEFERRED : JETSAM_PRIORITY_IDLE;
1291 } else {
1292 priority = p->p_memstat_requestedpriority;
1293 }
1294
fe8ab488
A
1295 if (priority != p->p_memstat_effectivepriority) {
1296 memorystatus_update_priority_locked(p, priority, false);
1297 }
39236c6e
A
1298}
1299
1300/*
1301 * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
1302 * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
1303 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
1304 * priority idle band when clean (and killed earlier, protecting higher priority processes).
1305 *
1306 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
1307 * memorystatus_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
1308 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
1309 * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
1310 * band. The deferral can be cleared early by clearing the appropriate flag.
1311 *
1312 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
1313 * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
1314 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
1315 */
1316
1317int
1318memorystatus_dirty_track(proc_t p, uint32_t pcontrol) {
1319 unsigned int old_dirty;
1320 boolean_t reschedule = FALSE;
fe8ab488
A
1321 boolean_t already_deferred = FALSE;
1322 boolean_t defer_now = FALSE;
39236c6e
A
1323 int ret;
1324
fe8ab488
A
1325 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK),
1326 p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0);
1327
39236c6e 1328 proc_list_lock();
316670eb 1329
fe8ab488
A
1330 if ((p->p_listflag & P_LIST_EXITED) != 0) {
1331 /*
1332 * Process is on its way out.
1333 */
1334 ret = EBUSY;
1335 goto exit;
1336 }
1337
39236c6e
A
1338 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
1339 ret = EPERM;
1340 goto exit;
316670eb
A
1341 }
1342
39236c6e
A
1343 if (!memorystatus_validate_track_flags(p, pcontrol)) {
1344 ret = EINVAL;
1345 goto exit;
1346 }
1347
1348 old_dirty = p->p_memstat_dirty;
1349
1350 /* These bits are cumulative, as per <rdar://problem/11159924> */
1351 if (pcontrol & PROC_DIRTY_TRACK) {
1352 p->p_memstat_dirty |= P_DIRTY_TRACK;
1353 }
1354
1355 if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
1356 p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
1357 }
1358
fe8ab488
A
1359 if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
1360 p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
1361 }
1362
1363 if (old_dirty & P_DIRTY_DEFER_IN_PROGRESS) {
1364 already_deferred = TRUE;
1365 }
1366
39236c6e 1367 /* This can be set and cleared exactly once. */
fe8ab488
A
1368 if (pcontrol & PROC_DIRTY_DEFER) {
1369
1370 if ( !(old_dirty & P_DIRTY_DEFER)) {
1371 p->p_memstat_dirty |= P_DIRTY_DEFER;
1372 }
1373
1374 defer_now = TRUE;
39236c6e
A
1375 }
1376
fe8ab488 1377 MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for process %d\n",
39236c6e 1378 ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
fe8ab488 1379 defer_now ? "Y" : "N",
39236c6e
A
1380 p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
1381 p->p_pid);
1382
1383 /* Kick off or invalidate the idle exit deferment if there's a state transition. */
1384 if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
1385 if (((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) &&
fe8ab488
A
1386 defer_now && !already_deferred) {
1387
1388 /*
1389 * Request to defer a clean process that's idle-exit enabled
1390 * and not already in the jetsam deferred band.
1391 */
39236c6e
A
1392 memorystatus_schedule_idle_demotion_locked(p, TRUE);
1393 reschedule = TRUE;
fe8ab488
A
1394
1395 } else if (!defer_now && already_deferred) {
1396
1397 /*
1398 * Either the process is no longer idle-exit enabled OR
1399 * there's a request to cancel a currently active deferral.
1400 */
1401 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1402 reschedule = TRUE;
1403 }
1404 } else {
1405
1406 /*
1407 * We are trying to operate on a dirty process. Dirty processes have to
1408 * be removed from the deferred band. The question is do we reset the
1409 * deferred state or not?
1410 *
1411 * This could be a legal request like:
1412 * - this process had opted into the JETSAM_DEFERRED band
1413 * - but it's now dirty and requests to opt out.
1414 * In this case, we remove the process from the band and reset its
1415 * state too. It'll opt back in properly when needed.
1416 *
1417 * OR, this request could be a user-space bug. E.g.:
1418 * - this process had opted into the JETSAM_DEFERRED band when clean
1419 * - and, then issues another request to again put it into the band except
1420 * this time the process is dirty.
1421 * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of
1422 * the deferred band with its state intact. So our request below is no-op.
1423 * But we do it here anyways for coverage.
1424 *
1425 * memorystatus_update_idle_priority_locked()
1426 * single-mindedly treats a dirty process as "cannot be in the deferred band".
1427 */
1428
1429 if (!defer_now && already_deferred) {
39236c6e
A
1430 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1431 reschedule = TRUE;
fe8ab488
A
1432 } else {
1433 memorystatus_invalidate_idle_demotion_locked(p, FALSE);
1434 reschedule = TRUE;
316670eb
A
1435 }
1436 }
39236c6e
A
1437
1438 memorystatus_update_idle_priority_locked(p);
1439
1440 if (reschedule) {
1441 memorystatus_reschedule_idle_demotion_locked();
1442 }
1443
1444 ret = 0;
316670eb 1445
39236c6e
A
1446exit:
1447 proc_list_unlock();
316670eb
A
1448
1449 return ret;
1450}
2d21ac55 1451
39236c6e
A
1452int
1453memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) {
1454 int ret;
1455 boolean_t kill = false;
1456 boolean_t reschedule = FALSE;
1457 boolean_t was_dirty = FALSE;
1458 boolean_t now_dirty = FALSE;
6d2010ae 1459
39236c6e 1460 MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty);
fe8ab488
A
1461
1462 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0);
b0d623f7 1463
39236c6e
A
1464 proc_list_lock();
1465
fe8ab488
A
1466 if ((p->p_listflag & P_LIST_EXITED) != 0) {
1467 /*
1468 * Process is on its way out.
1469 */
1470 ret = EBUSY;
1471 goto exit;
1472 }
1473
39236c6e
A
1474 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
1475 ret = EPERM;
1476 goto exit;
1477 }
1478
1479 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
1480 was_dirty = TRUE;
1481
1482 if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
1483 /* Dirty tracking not enabled */
1484 ret = EINVAL;
1485 } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
1486 /*
1487 * Process is set to be terminated and we're attempting to mark it dirty.
1488 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
1489 */
1490 ret = EBUSY;
1491 } else {
1492 int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
1493 if (pcontrol && !(p->p_memstat_dirty & flag)) {
1494 /* Mark the process as having been dirtied at some point */
1495 p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
1496 memorystatus_dirty_count++;
1497 ret = 0;
1498 } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
1499 if ((flag == P_DIRTY_SHUTDOWN) && (!p->p_memstat_dirty & P_DIRTY)) {
1500 /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
1501 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
1502 kill = true;
1503 } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
1504 /* Kill previously terminated processes if set clean */
1505 kill = true;
1506 }
1507 p->p_memstat_dirty &= ~flag;
1508 memorystatus_dirty_count--;
1509 ret = 0;
1510 } else {
1511 /* Already set */
1512 ret = EALREADY;
316670eb 1513 }
39236c6e
A
1514 }
1515
1516 if (ret != 0) {
1517 goto exit;
1518 }
1519
1520 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
1521 now_dirty = TRUE;
1522
1523 if ((was_dirty == TRUE && now_dirty == FALSE) ||
1524 (was_dirty == FALSE && now_dirty == TRUE)) {
1525
1526 /* Manage idle exit deferral, if applied */
1527 if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) ==
1528 (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) {
fe8ab488
A
1529
1530 /*
1531 * P_DIRTY_DEFER_IN_PROGRESS means the process is in the deferred band OR it might be heading back
1532 * there once it's clean again and has some protection window left.
1533 */
1534
39236c6e 1535 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
fe8ab488
A
1536 /*
1537 * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE"
1538 *
1539 * The process will move from the deferred band to its higher requested
1540 * jetsam band. But we don't clear its state i.e. we want to remember that
1541 * this process was part of the "deferred" band and will return to it.
1542 *
1543 * This way, we don't let it age beyond the protection
1544 * window when it returns to "clean". All the while giving
1545 * it a chance to perform its work while "dirty".
1546 *
1547 */
39236c6e
A
1548 memorystatus_invalidate_idle_demotion_locked(p, FALSE);
1549 reschedule = TRUE;
1550 } else {
fe8ab488
A
1551
1552 /*
1553 * Process is back from "dirty" to "clean".
1554 *
1555 * Is its timer up OR does it still have some protection
1556 * window left?
1557 */
1558
39236c6e 1559 if (mach_absolute_time() >= p->p_memstat_idledeadline) {
fe8ab488
A
1560 /*
1561 * The process' deadline has expired. It currently
1562 * does not reside in the DEFERRED bucket.
1563 *
1564 * It's on its way to the JETSAM_PRIORITY_IDLE
1565 * bucket via memorystatus_update_idle_priority_locked()
1566 * below.
1567
1568 * So all we need to do is reset all the state on the
1569 * process that's related to the DEFERRED bucket i.e.
1570 * the DIRTY_DEFER_IN_PROGRESS flag and the timer deadline.
1571 *
1572 */
1573
1574 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1575 reschedule = TRUE;
39236c6e 1576 } else {
fe8ab488
A
1577 /*
1578 * It still has some protection window left and so
1579 * we just re-arm the timer without modifying any
1580 * state on the process.
1581 */
39236c6e
A
1582 memorystatus_schedule_idle_demotion_locked(p, FALSE);
1583 reschedule = TRUE;
1584 }
1585 }
1586 }
1587
1588 memorystatus_update_idle_priority_locked(p);
1589
1590 /* If the deferral state changed, reschedule the demotion timer */
1591 if (reschedule) {
1592 memorystatus_reschedule_idle_demotion_locked();
1593 }
1594 }
1595
1596 if (kill) {
1597 psignal(p, SIGKILL);
1598 }
1599
1600exit:
1601 proc_list_unlock();
1602
1603 return ret;
1604}
b0d623f7 1605
39236c6e 1606int
fe8ab488
A
1607memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) {
1608
39236c6e 1609 int ret = 0;
fe8ab488
A
1610
1611 MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty);
39236c6e 1612
fe8ab488
A
1613 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0);
1614
1615 proc_list_lock();
1616
1617 if ((p->p_listflag & P_LIST_EXITED) != 0) {
1618 /*
1619 * Process is on its way out.
1620 */
1621 ret = EBUSY;
1622 goto exit;
1623 }
1624
1625 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
1626 ret = EPERM;
1627 goto exit;
1628 }
1629
1630 if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
1631 /* Dirty tracking not enabled */
1632 ret = EINVAL;
1633 goto exit;
1634 }
1635
1636 if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER)) == 0) {
1637 ret = EINVAL;
1638 goto exit;
1639 }
1640
1641 if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
1642 p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
1643 }
1644
1645 /* This can be set and cleared exactly once. */
1646 if (pcontrol & PROC_DIRTY_DEFER) {
1647
1648 if (p->p_memstat_dirty & P_DIRTY_DEFER) {
1649
1650 p->p_memstat_dirty &= ~P_DIRTY_DEFER;
1651
1652 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1653 memorystatus_update_idle_priority_locked(p);
1654 memorystatus_reschedule_idle_demotion_locked();
1655 }
1656 }
1657
1658 ret = 0;
1659exit:
1660 proc_list_unlock();
1661
1662 return ret;
1663}
1664
1665int
1666memorystatus_dirty_get(proc_t p) {
1667 int ret = 0;
1668
1669 proc_list_lock();
1670
1671 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
39236c6e
A
1672 ret |= PROC_DIRTY_TRACKED;
1673 if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
1674 ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
1675 }
1676 if (p->p_memstat_dirty & P_DIRTY) {
1677 ret |= PROC_DIRTY_IS_DIRTY;
1678 }
fe8ab488
A
1679 if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
1680 ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
1681 }
39236c6e
A
1682 }
1683
1684 proc_list_unlock();
1685
1686 return ret;
1687}
b0d623f7 1688
39236c6e
A
1689int
1690memorystatus_on_terminate(proc_t p) {
1691 int sig;
1692
1693 proc_list_lock();
1694
1695 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
1696
1697 if ((p->p_memstat_dirty & (P_DIRTY_TRACK|P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) {
1698 /* Clean; mark as terminated and issue SIGKILL */
1699 sig = SIGKILL;
1700 } else {
1701 /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
1702 sig = SIGTERM;
316670eb 1703 }
39236c6e
A
1704
1705 proc_list_unlock();
1706
1707 return sig;
316670eb 1708}
b0d623f7 1709
316670eb 1710void
39236c6e
A
1711memorystatus_on_suspend(proc_t p)
1712{
316670eb 1713#if CONFIG_FREEZE
39236c6e 1714 uint32_t pages;
fe8ab488 1715 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
39236c6e
A
1716#endif
1717 proc_list_lock();
1718#if CONFIG_FREEZE
1719 p->p_memstat_suspendedfootprint = pages;
1720 memorystatus_suspended_footprint_total += pages;
1721 memorystatus_suspended_count++;
316670eb 1722#endif
39236c6e
A
1723 p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
1724 proc_list_unlock();
1725}
6d2010ae 1726
39236c6e
A
1727void
1728memorystatus_on_resume(proc_t p)
1729{
1730#if CONFIG_FREEZE
1731 boolean_t frozen;
1732 pid_t pid;
1733#endif
6d2010ae 1734
39236c6e 1735 proc_list_lock();
6d2010ae 1736
316670eb 1737#if CONFIG_FREEZE
39236c6e
A
1738 frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
1739 if (frozen) {
1740 memorystatus_frozen_count--;
1741 p->p_memstat_state |= P_MEMSTAT_PRIOR_THAW;
1742 }
1743
1744 memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
1745 memorystatus_suspended_count--;
1746
1747 pid = p->p_pid;
316670eb 1748#endif
39236c6e
A
1749
1750 p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
1751
1752 proc_list_unlock();
1753
1754#if CONFIG_FREEZE
1755 if (frozen) {
1756 memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
1757 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
316670eb 1758 }
39236c6e 1759#endif
316670eb 1760}
6d2010ae 1761
316670eb 1762void
39236c6e 1763memorystatus_on_inactivity(proc_t p)
6d2010ae 1764{
39236c6e 1765#pragma unused(p)
316670eb
A
1766#if CONFIG_FREEZE
1767 /* Wake the freeze thread */
1768 thread_wakeup((event_t)&memorystatus_freeze_wakeup);
1769#endif
1770}
6d2010ae 1771
39236c6e
A
1772static uint32_t
1773memorystatus_build_state(proc_t p) {
1774 uint32_t snapshot_state = 0;
1775
1776 /* General */
1777 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
1778 snapshot_state |= kMemorystatusSuspended;
1779 }
1780 if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
1781 snapshot_state |= kMemorystatusFrozen;
1782 }
1783 if (p->p_memstat_state & P_MEMSTAT_PRIOR_THAW) {
1784 snapshot_state |= kMemorystatusWasThawed;
1785 }
1786
1787 /* Tracking */
1788 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
1789 snapshot_state |= kMemorystatusTracked;
1790 }
1791 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
1792 snapshot_state |= kMemorystatusSupportsIdleExit;
1793 }
1794 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
1795 snapshot_state |= kMemorystatusDirty;
1796 }
1797
1798 return snapshot_state;
1799}
1800
1801#if !CONFIG_JETSAM
1802
1803static boolean_t
1804kill_idle_exit_proc(void)
316670eb 1805{
39236c6e 1806 proc_t p, victim_p = PROC_NULL;
316670eb 1807 uint64_t current_time;
39236c6e
A
1808 boolean_t killed = FALSE;
1809 unsigned int i = 0;
316670eb 1810
39236c6e 1811 /* Pick next idle exit victim. */
316670eb 1812 current_time = mach_absolute_time();
6d2010ae 1813
39236c6e 1814 proc_list_lock();
6d2010ae 1815
39236c6e
A
1816 p = memorystatus_get_first_proc_locked(&i, FALSE);
1817 while (p) {
1818 /* No need to look beyond the idle band */
1819 if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
1820 break;
1821 }
1822
1823 if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT|P_DIRTY_IS_DIRTY|P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
1824 if (current_time >= p->p_memstat_idledeadline) {
1825 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
1826 victim_p = proc_ref_locked(p);
1827 break;
316670eb
A
1828 }
1829 }
39236c6e
A
1830
1831 p = memorystatus_get_next_proc_locked(&i, p, FALSE);
6d2010ae 1832 }
316670eb 1833
39236c6e
A
1834 proc_list_unlock();
1835
1836 if (victim_p) {
1837 printf("memorystatus_thread: idle exiting pid %d [%s]\n", victim_p->p_pid, (victim_p->p_comm ? victim_p->p_comm : "(unknown)"));
1838 killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit);
1839 proc_rele(victim_p);
316670eb 1840 }
b0d623f7 1841
39236c6e 1842 return killed;
2d21ac55 1843}
39236c6e 1844#endif
2d21ac55 1845
fe8ab488 1846#if CONFIG_JETSAM
39236c6e
A
1847static void
1848memorystatus_thread_wake(void) {
1849 thread_wakeup((event_t)&memorystatus_wakeup);
b0d623f7 1850}
fe8ab488
A
1851#endif /* CONFIG_JETSAM */
1852
1853extern void vm_pressure_response(void);
b0d623f7 1854
316670eb 1855static int
39236c6e
A
1856memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
1857{
1858 if (interval_ms) {
1859 assert_wait_timeout(&memorystatus_wakeup, THREAD_UNINT, interval_ms, 1000 * NSEC_PER_USEC);
1860 } else {
1861 assert_wait(&memorystatus_wakeup, THREAD_UNINT);
1862 }
316670eb 1863
39236c6e
A
1864 return thread_block(continuation);
1865}
316670eb 1866
39236c6e
A
1867static void
1868memorystatus_thread(void *param __unused, wait_result_t wr __unused)
1869{
1870 static boolean_t is_vm_privileged = FALSE;
1871#if CONFIG_JETSAM
1872 boolean_t post_snapshot = FALSE;
1873 uint32_t errors = 0;
fe8ab488 1874 uint32_t hwm_kill = 0;
39236c6e 1875#endif
316670eb 1876
39236c6e
A
1877 if (is_vm_privileged == FALSE) {
1878 /*
1879 * It's the first time the thread has run, so just mark the thread as privileged and block.
1880 * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
1881 */
1882 thread_wire(host_priv_self(), current_thread(), TRUE);
1883 is_vm_privileged = TRUE;
1884
1885 memorystatus_thread_block(0, memorystatus_thread);
316670eb
A
1886 }
1887
39236c6e
A
1888#if CONFIG_JETSAM
1889
1890 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
1891 memorystatus_available_pages, 0, 0, 0, 0);
316670eb 1892
fe8ab488
A
1893 /*
1894 * Jetsam aware version.
1895 *
1896 * The VM pressure notification thread is working it's way through clients in parallel.
39236c6e 1897 *
fe8ab488
A
1898 * So, while the pressure notification thread is targeting processes in order of
1899 * increasing jetsam priority, we can hopefully reduce / stop it's work by killing
1900 * any processes that have exceeded their highwater mark.
39236c6e 1901 *
fe8ab488
A
1902 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
1903 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
39236c6e 1904 */
fe8ab488
A
1905 while (is_thrashing(kill_under_pressure_cause) ||
1906 memorystatus_available_pages <= memorystatus_available_pages_pressure) {
39236c6e
A
1907 boolean_t killed;
1908 int32_t priority;
fe8ab488
A
1909 uint32_t cause;
1910
1911 if (kill_under_pressure_cause) {
1912 cause = kill_under_pressure_cause;
1913 } else {
1914 cause = kMemorystatusKilledVMPageShortage;
1915 }
39236c6e
A
1916
1917#if LEGACY_HIWATER
1918 /* Highwater */
1919 killed = memorystatus_kill_hiwat_proc(&errors);
1920 if (killed) {
fe8ab488 1921 hwm_kill++;
39236c6e
A
1922 post_snapshot = TRUE;
1923 goto done;
fe8ab488
A
1924 } else {
1925 memorystatus_hwm_candidates = FALSE;
1926 }
1927
1928 /* No highwater processes to kill. Continue or stop for now? */
1929 if (!is_thrashing(kill_under_pressure_cause) &&
1930 (memorystatus_available_pages > memorystatus_available_pages_critical)) {
1931 /*
1932 * We are _not_ out of pressure but we are above the critical threshold and there's:
1933 * - no compressor thrashing
1934 * - no more HWM processes left.
1935 * For now, don't kill any other processes.
1936 */
1937
1938 if (hwm_kill == 0) {
1939 memorystatus_thread_wasted_wakeup++;
1940 }
1941
1942 break;
39236c6e
A
1943 }
1944#endif
1945
1946 /* LRU */
1947 killed = memorystatus_kill_top_process(TRUE, cause, &priority, &errors);
1948 if (killed) {
fe8ab488
A
1949 /* Don't generate logs for steady-state idle-exit kills (unless overridden for debug) */
1950 if ((priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot) {
39236c6e
A
1951 post_snapshot = TRUE;
1952 }
1953 goto done;
1954 }
fe8ab488
A
1955
1956 if (memorystatus_available_pages <= memorystatus_available_pages_critical) {
1957 /* Under pressure and unable to kill a process - panic */
1958 panic("memorystatus_jetsam_thread: no victim! available pages:%d\n", memorystatus_available_pages);
1959 }
39236c6e
A
1960
1961done:
fe8ab488
A
1962
1963 /*
1964 * We do not want to over-kill when thrashing has been detected.
1965 * To avoid that, we reset the flag here and notify the
1966 * compressor.
39236c6e 1967 */
fe8ab488
A
1968 if (is_thrashing(kill_under_pressure_cause)) {
1969 kill_under_pressure_cause = 0;
1970 vm_thrashing_jetsam_done();
39236c6e 1971 }
39236c6e 1972 }
fe8ab488
A
1973
1974 kill_under_pressure_cause = 0;
1975
39236c6e
A
1976 if (errors) {
1977 memorystatus_clear_errors();
1978 }
1979
1980#if VM_PRESSURE_EVENTS
fe8ab488
A
1981 /*
1982 * LD: We used to target the foreground process first and foremost here.
1983 * Now, we target all processes, starting from the non-suspended, background
1984 * processes first. We will target foreground too.
1985 *
1986 * memorystatus_update_vm_pressure(TRUE);
1987 */
1988 //vm_pressure_response();
39236c6e
A
1989#endif
1990
1991 if (post_snapshot) {
1992 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
1993 sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
1994 memorystatus_jetsam_snapshot->notification_time = mach_absolute_time();
1995 memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
1996 }
1997
1998 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
1999 memorystatus_available_pages, 0, 0, 0, 0);
2000
2001#else /* CONFIG_JETSAM */
2002
fe8ab488
A
2003 /*
2004 * Jetsam not enabled
39236c6e
A
2005 */
2006
39236c6e
A
2007#endif /* CONFIG_JETSAM */
2008
2009 memorystatus_thread_block(0, memorystatus_thread);
2010}
2011
2012#if !CONFIG_JETSAM
fe8ab488
A
2013/*
2014 * Returns TRUE:
2015 * when an idle-exitable proc was killed
2016 * Returns FALSE:
2017 * when there are no more idle-exitable procs found
2018 * when the attempt to kill an idle-exitable proc failed
2019 */
39236c6e 2020boolean_t memorystatus_idle_exit_from_VM(void) {
fe8ab488 2021 return(kill_idle_exit_proc());
39236c6e 2022}
fe8ab488 2023#endif /* !CONFIG_JETSAM */
39236c6e
A
2024
2025#if CONFIG_JETSAM
2026
2027/*
2028 * Callback invoked when allowable physical memory footprint exceeded
2029 * (dirty pages + IOKit mappings)
2030 *
2031 * This is invoked for both advisory, non-fatal per-task high watermarks,
fe8ab488 2032 * as well as the fatal task memory limits.
39236c6e
A
2033 */
2034void
2035memorystatus_on_ledger_footprint_exceeded(boolean_t warning, const int max_footprint_mb)
2036{
2037 proc_t p = current_proc();
fe8ab488
A
2038
2039 if (warning == FALSE) {
2040 printf("process %d (%s) exceeded physical memory footprint limit of %d MB\n",
2041 p->p_pid, p->p_comm, max_footprint_mb);
2042 }
39236c6e
A
2043
2044#if VM_PRESSURE_EVENTS
2045 if (warning == TRUE) {
fe8ab488 2046 if (memorystatus_warn_process(p->p_pid, TRUE /* critical? */) != TRUE) {
39236c6e 2047 /* Print warning, since it's possible that task has not registered for pressure notifications */
fe8ab488 2048 printf("task_exceeded_footprint: failed to warn the current task (exiting, or no handler registered?).\n");
39236c6e
A
2049 }
2050 return;
2051 }
2052#endif /* VM_PRESSURE_EVENTS */
2053
fe8ab488 2054 if ((p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT) == P_MEMSTAT_FATAL_MEMLIMIT) {
39236c6e 2055 /*
fe8ab488
A
2056 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
2057 * has violated either the system-wide per-task memory limit OR its own task limit.
39236c6e
A
2058 */
2059 if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit) != TRUE) {
2060 printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
2061 }
fe8ab488
A
2062 } else {
2063 /*
2064 * HWM offender exists. Done without locks or synchronization.
2065 * See comment near its declaration for more details.
2066 */
2067 memorystatus_hwm_candidates = TRUE;
2068 }
2069}
2070
2071/*
2072 * This is invoked when cpulimits have been exceeded while in fatal mode.
2073 * The jetsam_flags do not apply as those are for memory related kills.
2074 * We call this routine so that the offending process is killed with
2075 * a non-zero exit status.
2076 */
2077void
2078jetsam_on_ledger_cpulimit_exceeded(void)
2079{
2080 int retval = 0;
2081 int jetsam_flags = 0; /* make it obvious */
2082 proc_t p = current_proc();
2083
2084 printf("task_exceeded_cpulimit: killing pid %d [%s]\n",
2085 p->p_pid, (p->p_comm ? p->p_comm : "(unknown)"));
2086
2087 retval = jetsam_do_kill(p, jetsam_flags);
2088
2089 if (retval) {
2090 printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
39236c6e
A
2091 }
2092}
2093
2094static void
fe8ab488 2095memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
39236c6e
A
2096{
2097 assert(task);
2098 assert(footprint);
2099
2100 *footprint = (uint32_t)(get_task_phys_footprint(task) / PAGE_SIZE_64);
2101 if (max_footprint) {
2102 *max_footprint = (uint32_t)(get_task_phys_footprint_max(task) / PAGE_SIZE_64);
2103 }
fe8ab488
A
2104 if (max_footprint_lifetime) {
2105 *max_footprint_lifetime = (uint32_t)(get_task_resident_max(task) / PAGE_SIZE_64);
2106 }
2107 if (purgeable_pages) {
2108 *purgeable_pages = (uint32_t)(get_task_purgeable_size(task) / PAGE_SIZE_64);
39236c6e 2109 }
39236c6e
A
2110}
2111
fe8ab488 2112
39236c6e
A
2113static void
2114memorystatus_update_snapshot_locked(proc_t p, uint32_t kill_cause)
2115{
2116 unsigned int i;
2117
2118 for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
2119 if (memorystatus_jetsam_snapshot_list[i].pid == p->p_pid) {
2120 /* Update if the priority has changed since the snapshot was taken */
2121 if (memorystatus_jetsam_snapshot_list[i].priority != p->p_memstat_effectivepriority) {
2122 memorystatus_jetsam_snapshot_list[i].priority = p->p_memstat_effectivepriority;
2123 strlcpy(memorystatus_jetsam_snapshot_list[i].name, p->p_comm, MAXCOMLEN+1);
2124 memorystatus_jetsam_snapshot_list[i].state = memorystatus_build_state(p);
2125 memorystatus_jetsam_snapshot_list[i].user_data = p->p_memstat_userdata;
2126 memorystatus_jetsam_snapshot_list[i].fds = p->p_fd->fd_nfiles;
2127 }
2128 memorystatus_jetsam_snapshot_list[i].killed = kill_cause;
2129 return;
2130 }
2131 }
316670eb 2132}
b0d623f7 2133
39236c6e
A
2134void memorystatus_pages_update(unsigned int pages_avail)
2135{
fe8ab488
A
2136 memorystatus_available_pages = pages_avail;
2137
2138#if VM_PRESSURE_EVENTS
2139 /*
2140 * Since memorystatus_available_pages changes, we should
2141 * re-evaluate the pressure levels on the system and
2142 * check if we need to wake the pressure thread.
2143 * We also update memorystatus_level in that routine.
2144 */
2145 vm_pressure_response();
2146
2147 if (memorystatus_available_pages <= memorystatus_available_pages_pressure) {
2148
2149 if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) {
2150 memorystatus_thread_wake();
2151 }
2152 }
2153#else /* VM_PRESSURE_EVENTS */
2154
39236c6e
A
2155 boolean_t critical, delta;
2156
316670eb
A
2157 if (!memorystatus_delta) {
2158 return;
2159 }
39236c6e
A
2160
2161 critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
2162 delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
2163 || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;
2164
2165 if (critical || delta) {
39236c6e 2166 memorystatus_level = memorystatus_available_pages * 100 / atop_64(max_mem);
39236c6e 2167 memorystatus_thread_wake();
b0d623f7 2168 }
fe8ab488 2169#endif /* VM_PRESSURE_EVENTS */
316670eb
A
2170}
2171
2172static boolean_t
2173memorystatus_get_snapshot_properties_for_proc_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry)
2174{
fe8ab488
A
2175 clock_sec_t tv_sec;
2176 clock_usec_t tv_usec;
2177
39236c6e 2178 memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
316670eb
A
2179
2180 entry->pid = p->p_pid;
2181 strlcpy(&entry->name[0], p->p_comm, MAXCOMLEN+1);
39236c6e 2182 entry->priority = p->p_memstat_effectivepriority;
fe8ab488 2183 memorystatus_get_task_page_counts(p->task, &entry->pages, &entry->max_pages, &entry->max_pages_lifetime, &entry->purgeable_pages);
39236c6e
A
2184 entry->state = memorystatus_build_state(p);
2185 entry->user_data = p->p_memstat_userdata;
316670eb 2186 memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid));
fe8ab488
A
2187 entry->fds = p->p_fd->fd_nfiles;
2188
2189 absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
2190 entry->cpu_time.tv_sec = tv_sec;
2191 entry->cpu_time.tv_usec = tv_usec;
316670eb
A
2192
2193 return TRUE;
b0d623f7
A
2194}
2195
2196static void
316670eb 2197memorystatus_jetsam_snapshot_procs_locked(void)
b0d623f7 2198{
39236c6e
A
2199 proc_t p, next_p;
2200 unsigned int b = 0, i = 0;
2201 kern_return_t kr = KERN_SUCCESS;
2202
2203 mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
2204 vm_statistics64_data_t vm_stat;
2205
2206 if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count) != KERN_SUCCESS)) {
2207 printf("memorystatus_jetsam_snapshot_procs_locked: host_statistics64 failed with %d\n", kr);
2208 memset(&memorystatus_jetsam_snapshot->stats, 0, sizeof(memorystatus_jetsam_snapshot->stats));
2209 } else {
2210 memorystatus_jetsam_snapshot->stats.free_pages = vm_stat.free_count;
2211 memorystatus_jetsam_snapshot->stats.active_pages = vm_stat.active_count;
2212 memorystatus_jetsam_snapshot->stats.inactive_pages = vm_stat.inactive_count;
2213 memorystatus_jetsam_snapshot->stats.throttled_pages = vm_stat.throttled_count;
2214 memorystatus_jetsam_snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
2215 memorystatus_jetsam_snapshot->stats.wired_pages = vm_stat.wire_count;
2216
2217 memorystatus_jetsam_snapshot->stats.speculative_pages = vm_stat.speculative_count;
2218 memorystatus_jetsam_snapshot->stats.filebacked_pages = vm_stat.external_page_count;
2219 memorystatus_jetsam_snapshot->stats.anonymous_pages = vm_stat.internal_page_count;
2220 memorystatus_jetsam_snapshot->stats.compressions = vm_stat.compressions;
2221 memorystatus_jetsam_snapshot->stats.decompressions = vm_stat.decompressions;
2222 memorystatus_jetsam_snapshot->stats.compressor_pages = vm_stat.compressor_page_count;
2223 memorystatus_jetsam_snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
2224 }
2225
2226 next_p = memorystatus_get_first_proc_locked(&b, TRUE);
2227 while (next_p) {
2228 p = next_p;
2229 next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);
2230
316670eb
A
2231 if (FALSE == memorystatus_get_snapshot_properties_for_proc_locked(p, &memorystatus_jetsam_snapshot_list[i])) {
2232 continue;
2233 }
2234
2235 MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid = %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
b0d623f7
A
2236 p->p_pid,
2237 p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7],
2238 p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]);
316670eb 2239
39236c6e 2240 if (++i == memorystatus_jetsam_snapshot_max) {
b0d623f7
A
2241 break;
2242 }
2243 }
39236c6e
A
2244
2245 memorystatus_jetsam_snapshot->snapshot_time = mach_absolute_time();
2246 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = i;
b0d623f7
A
2247}
2248
39236c6e 2249#if DEVELOPMENT || DEBUG
b0d623f7 2250
39236c6e
A
2251static int
2252memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size) {
2253 int ret;
2254 memorystatus_jetsam_panic_options_t debug;
2255
2256 if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) {
2257 return EINVAL;
b0d623f7 2258 }
39236c6e
A
2259
2260 ret = copyin(buffer, &debug, buffer_size);
2261 if (ret) {
2262 return ret;
2263 }
2264
2265 /* Panic bits match kMemorystatusKilled* enum */
2266 memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask);
2267
2268 /* Copyout new value */
2269 debug.data = memorystatus_jetsam_panic_debug;
2270 ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t));
2271
2272 return ret;
b0d623f7
A
2273}
2274
39236c6e
A
2275#endif
2276
2277/*
2278 * Jetsam a specific process.
2279 */
2280static boolean_t
2281memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause) {
2282 boolean_t killed;
b0d623f7 2283 proc_t p;
39236c6e
A
2284
2285 /* TODO - add a victim queue and push this into the main jetsam thread */
2286
2287 p = proc_find(victim_pid);
2288 if (!p) {
2289 return FALSE;
2290 }
2291
fe8ab488
A
2292 printf("memorystatus: specifically killing pid %d [%s] (%s) - memorystatus_available_pages: %d\n",
2293 victim_pid, (p->p_comm ? p->p_comm : "(unknown)"),
2294 jetsam_kill_cause_name[cause], memorystatus_available_pages);
39236c6e
A
2295
2296 proc_list_lock();
2297
2298 if (memorystatus_jetsam_snapshot_count == 0) {
2299 memorystatus_jetsam_snapshot_procs_locked();
2300 }
2301
2302 memorystatus_update_snapshot_locked(p, cause);
2303 proc_list_unlock();
2304
2305 killed = memorystatus_do_kill(p, cause);
2306 proc_rele(p);
2307
2308 return killed;
2309}
2310
2311/*
2312 * Jetsam the first process in the queue.
2313 */
2314static boolean_t
2315memorystatus_kill_top_process(boolean_t any, uint32_t cause, int32_t *priority, uint32_t *errors)
2316{
2317 pid_t aPid;
2318 proc_t p = PROC_NULL, next_p = PROC_NULL;
2319 boolean_t new_snapshot = FALSE, killed = FALSE;
2320 unsigned int i = 0;
b0d623f7 2321
6d2010ae
A
2322#ifndef CONFIG_FREEZE
2323#pragma unused(any)
2324#endif
316670eb 2325
39236c6e
A
2326 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
2327 memorystatus_available_pages, 0, 0, 0, 0);
6d2010ae 2328
39236c6e 2329 proc_list_lock();
316670eb 2330
fe8ab488
A
2331 memorystatus_sort_by_largest_process_locked(JETSAM_PRIORITY_FOREGROUND);
2332
39236c6e
A
2333 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2334 while (next_p) {
316670eb
A
2335#if DEVELOPMENT || DEBUG
2336 int activeProcess;
2337 int procSuspendedForDiagnosis;
2338#endif /* DEVELOPMENT || DEBUG */
39236c6e
A
2339
2340 p = next_p;
2341 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
2342
6d2010ae 2343#if DEVELOPMENT || DEBUG
39236c6e
A
2344 activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
2345 procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
6d2010ae 2346#endif /* DEVELOPMENT || DEBUG */
316670eb 2347
39236c6e 2348 aPid = p->p_pid;
316670eb 2349
39236c6e
A
2350 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
2351 continue;
b0d623f7 2352 }
39236c6e 2353
6d2010ae 2354#if DEVELOPMENT || DEBUG
39236c6e
A
2355 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
2356 printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
2357 continue;
2358 }
6d2010ae 2359#endif /* DEVELOPMENT || DEBUG */
316670eb 2360
fe8ab488
A
2361 if (cause == kMemorystatusKilledVnodes)
2362 {
2363 /*
2364 * If the system runs out of vnodes, we systematically jetsam
2365 * processes in hopes of stumbling onto a vnode gain that helps
2366 * the system recover. The process that happens to trigger
2367 * this path has no known relationship to the vnode consumption.
2368 * We attempt to safeguard that process e.g: do not jetsam it.
2369 */
2370
2371 if (p == current_proc()) {
2372 /* do not jetsam the current process */
2373 continue;
2374 }
2375 }
2376
6d2010ae 2377#if CONFIG_FREEZE
39236c6e
A
2378 boolean_t skip;
2379 boolean_t reclaim_proc = !(p->p_memstat_state & (P_MEMSTAT_LOCKED | P_MEMSTAT_NORECLAIM));
2380 if (any || reclaim_proc) {
2381 skip = FALSE;
2382 } else {
2383 skip = TRUE;
2384 }
316670eb 2385
39236c6e
A
2386 if (skip) {
2387 continue;
2388 } else
6d2010ae 2389#endif
39236c6e
A
2390 {
2391 if (priority) {
2392 *priority = p->p_memstat_effectivepriority;
2393 }
2394
2395 /*
2396 * Capture a snapshot if none exists and:
2397 * - priority was not requested (this is something other than an ambient kill)
2398 * - the priority was requested *and* the targeted process is not at idle priority
2399 */
2400 if ((memorystatus_jetsam_snapshot_count == 0) &&
fe8ab488 2401 (memorystatus_idle_snapshot || ((!priority) || (priority && (*priority != JETSAM_PRIORITY_IDLE))))) {
39236c6e
A
2402 memorystatus_jetsam_snapshot_procs_locked();
2403 new_snapshot = TRUE;
2404 }
2405
2406 /*
2407 * Mark as terminated so that if exit1() indicates success, but the process (for example)
2408 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
2409 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
2410 * acquisition of the proc lock.
2411 */
2412 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
2413
6d2010ae 2414#if DEVELOPMENT || DEBUG
39236c6e
A
2415 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && activeProcess) {
2416 MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n",
2417 aPid, (p->p_comm ? p->p_comm: "(unknown)"), memorystatus_level);
2418 memorystatus_update_snapshot_locked(p, kMemorystatusKilledDiagnostic);
2419 p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
2420 if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) {
2421 jetsam_diagnostic_suspended_one_active_proc = 1;
2422 printf("jetsam: returning after suspending first active proc - %d\n", aPid);
2423 }
2424
2425 p = proc_ref_locked(p);
2426 proc_list_unlock();
2427 if (p) {
316670eb
A
2428 task_suspend(p->task);
2429 proc_rele(p);
39236c6e
A
2430 killed = TRUE;
2431 }
2432
2433 goto exit;
2434 } else
6d2010ae 2435#endif /* DEVELOPMENT || DEBUG */
39236c6e
A
2436 {
2437 /* Shift queue, update stats */
2438 memorystatus_update_snapshot_locked(p, cause);
2439
2440 p = proc_ref_locked(p);
2441 proc_list_unlock();
2442 if (p) {
fe8ab488
A
2443 printf("memorystatus: %s %d [%s] (%s) - memorystatus_available_pages: %d\n",
2444 ((p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) ?
2445 "idle exiting pid" : "jetsam killing pid"),
2446 aPid, (p->p_comm ? p->p_comm : "(unknown)"),
2447 jetsam_kill_cause_name[cause], memorystatus_available_pages);
39236c6e
A
2448 killed = memorystatus_do_kill(p, cause);
2449 }
2450
2451 /* Success? */
2452 if (killed) {
6d2010ae 2453 proc_rele(p);
39236c6e 2454 goto exit;
6d2010ae 2455 }
39236c6e
A
2456
2457 /* Failure - unwind and restart. */
2458 proc_list_lock();
2459 proc_rele_locked(p);
2460 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
2461 p->p_memstat_state |= P_MEMSTAT_ERROR;
2462 *errors += 1;
2463 i = 0;
2464 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6d2010ae 2465 }
b0d623f7 2466 }
b0d623f7 2467 }
316670eb 2468
39236c6e 2469 proc_list_unlock();
316670eb 2470
39236c6e
A
2471exit:
2472 /* Clear snapshot if freshly captured and no target was found */
2473 if (new_snapshot && !killed) {
2474 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
316670eb
A
2475 }
2476
39236c6e
A
2477 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
2478 memorystatus_available_pages, killed ? aPid : 0, 0, 0, 0);
b0d623f7 2479
39236c6e 2480 return killed;
316670eb
A
2481}
2482
39236c6e
A
2483#if LEGACY_HIWATER
2484
2485static boolean_t
2486memorystatus_kill_hiwat_proc(uint32_t *errors)
d1ecb069 2487{
39236c6e
A
2488 pid_t aPid = 0;
2489 proc_t p = PROC_NULL, next_p = PROC_NULL;
2490 boolean_t new_snapshot = FALSE, killed = FALSE;
2491 unsigned int i = 0;
316670eb 2492
39236c6e
A
2493 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
2494 memorystatus_available_pages, 0, 0, 0, 0);
316670eb 2495
39236c6e 2496 proc_list_lock();
fe8ab488 2497 memorystatus_sort_by_largest_process_locked(JETSAM_PRIORITY_FOREGROUND);
316670eb 2498
39236c6e
A
2499 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2500 while (next_p) {
2501 uint32_t footprint;
2502 boolean_t skip;
2503
2504 p = next_p;
2505 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
316670eb 2506
39236c6e 2507 aPid = p->p_pid;
316670eb 2508
39236c6e
A
2509 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
2510 continue;
2511 }
2512
2513 /* skip if no limit set */
2514 if (p->p_memstat_memlimit <= 0) {
2515 continue;
d1ecb069 2516 }
316670eb 2517
39236c6e
A
2518 /* skip if a currently inapplicable limit is encountered */
2519 if ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
2520 continue;
2521 }
2522
2523 footprint = (uint32_t)(get_task_phys_footprint(p->task) / (1024 * 1024));
2524 skip = (((int32_t)footprint) <= p->p_memstat_memlimit);
6d2010ae 2525#if DEVELOPMENT || DEBUG
39236c6e
A
2526 if (!skip && (memorystatus_jetsam_policy & kPolicyDiagnoseActive)) {
2527 if (p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED) {
2528 continue;
6d2010ae 2529 }
39236c6e 2530 }
6d2010ae 2531#endif /* DEVELOPMENT || DEBUG */
316670eb 2532
6d2010ae 2533#if CONFIG_FREEZE
39236c6e
A
2534 if (!skip) {
2535 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
2536 skip = TRUE;
2537 } else {
2538 skip = FALSE;
2539 }
2540 }
6d2010ae 2541#endif
316670eb 2542
39236c6e
A
2543 if (skip) {
2544 continue;
2545 } else {
fe8ab488
A
2546 MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %d Mb > 1 (%d Mb)\n",
2547 (memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing", aPid, p->p_comm, footprint, p->p_memstat_memlimit);
39236c6e
A
2548
2549 if (memorystatus_jetsam_snapshot_count == 0) {
2550 memorystatus_jetsam_snapshot_procs_locked();
2551 new_snapshot = TRUE;
2552 }
2553
2554 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
2555
6d2010ae 2556#if DEVELOPMENT || DEBUG
39236c6e
A
2557 if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
2558 MEMORYSTATUS_DEBUG(1, "jetsam: pid %d suspended for diagnosis - memorystatus_available_pages: %d\n", aPid, memorystatus_available_pages);
2559 memorystatus_update_snapshot_locked(p, kMemorystatusKilledDiagnostic);
2560 p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
2561
2562 p = proc_ref_locked(p);
2563 proc_list_unlock();
2564 if (p) {
6d2010ae
A
2565 task_suspend(p->task);
2566 proc_rele(p);
39236c6e
A
2567 killed = TRUE;
2568 }
2569
2570 goto exit;
2571 } else
6d2010ae 2572#endif /* DEVELOPMENT || DEBUG */
39236c6e
A
2573 {
2574 memorystatus_update_snapshot_locked(p, kMemorystatusKilledHiwat);
2575
2576 p = proc_ref_locked(p);
2577 proc_list_unlock();
2578 if (p) {
2579 printf("memorystatus: jetsam killing pid %d [%s] (highwater) - memorystatus_available_pages: %d\n",
2580 aPid, (p->p_comm ? p->p_comm : "(unknown)"), memorystatus_available_pages);
2581 killed = memorystatus_do_kill(p, kMemorystatusKilledHiwat);
2582 }
2583
2584 /* Success? */
2585 if (killed) {
6d2010ae 2586 proc_rele(p);
39236c6e 2587 goto exit;
6d2010ae 2588 }
6d2010ae 2589
39236c6e
A
2590 /* Failure - unwind and restart. */
2591 proc_list_lock();
2592 proc_rele_locked(p);
2593 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
2594 p->p_memstat_state |= P_MEMSTAT_ERROR;
2595 *errors += 1;
2596 i = 0;
2597 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2598 }
6d2010ae
A
2599 }
2600 }
316670eb 2601
39236c6e 2602 proc_list_unlock();
316670eb 2603
39236c6e
A
2604exit:
2605 /* Clear snapshot if freshly captured and no target was found */
2606 if (new_snapshot && !killed) {
2607 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
316670eb
A
2608 }
2609
39236c6e
A
2610 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
2611 memorystatus_available_pages, killed ? aPid : 0, 0, 0, 0);
6d2010ae 2612
39236c6e 2613 return killed;
316670eb 2614}
2d21ac55 2615
39236c6e 2616#endif /* LEGACY_HIWATER */
316670eb 2617
39236c6e
A
2618static boolean_t
2619memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) {
2620 /* TODO: allow a general async path */
fe8ab488
A
2621 if ((victim_pid != -1) || (cause != kMemorystatusKilledVMPageShortage && cause != kMemorystatusKilledVMThrashing &&
2622 cause != kMemorystatusKilledFCThrashing)) {
39236c6e 2623 return FALSE;
316670eb 2624 }
39236c6e 2625
fe8ab488 2626 kill_under_pressure_cause = cause;
39236c6e
A
2627 memorystatus_thread_wake();
2628 return TRUE;
2629}
2d21ac55 2630
39236c6e
A
2631static boolean_t
2632memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause) {
2633 boolean_t res;
2634 uint32_t errors = 0;
2635
2636 if (victim_pid == -1) {
2637 /* No pid, so kill first process */
2638 res = memorystatus_kill_top_process(TRUE, cause, NULL, &errors);
2639 } else {
2640 res = memorystatus_kill_specific_process(victim_pid, cause);
2641 }
2642
2643 if (errors) {
2644 memorystatus_clear_errors();
2645 }
2646
2647 if (res == TRUE) {
2648 /* Fire off snapshot notification */
2649 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
2650 sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
2651 memorystatus_jetsam_snapshot->notification_time = mach_absolute_time();
2652 memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
2653 }
2654
2655 return res;
2656}
b0d623f7 2657
39236c6e
A
2658boolean_t
2659memorystatus_kill_on_VM_page_shortage(boolean_t async) {
2660 if (async) {
2661 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
2662 } else {
2663 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage);
2664 }
2665}
2d21ac55 2666
39236c6e
A
2667boolean_t
2668memorystatus_kill_on_VM_thrashing(boolean_t async) {
2669 if (async) {
2670 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMThrashing);
2671 } else {
2672 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing);
2d21ac55
A
2673 }
2674}
b0d623f7 2675
fe8ab488
A
2676boolean_t
2677memorystatus_kill_on_FC_thrashing(boolean_t async) {
2678 if (async) {
2679 return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing);
2680 } else {
2681 return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing);
2682 }
2683}
2684
39236c6e
A
2685boolean_t
2686memorystatus_kill_on_vnode_limit(void) {
2687 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes);
2688}
2689
316670eb
A
2690#endif /* CONFIG_JETSAM */
2691
6d2010ae
A
2692#if CONFIG_FREEZE
2693
2694__private_extern__ void
316670eb 2695memorystatus_freeze_init(void)
6d2010ae 2696{
316670eb
A
2697 kern_return_t result;
2698 thread_t thread;
39236c6e 2699
316670eb
A
2700 result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread);
2701 if (result == KERN_SUCCESS) {
2702 thread_deallocate(thread);
2703 } else {
2704 panic("Could not create memorystatus_freeze_thread");
2705 }
6d2010ae
A
2706}
2707
316670eb 2708static int
39236c6e 2709memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low)
6d2010ae 2710{
39236c6e
A
2711 pid_t aPid = 0;
2712 int ret = -1;
2713 proc_t p = PROC_NULL, next_p = PROC_NULL;
2714 unsigned int i = 0;
6d2010ae 2715
39236c6e
A
2716 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
2717 memorystatus_available_pages, 0, 0, 0, 0);
2718
2719 proc_list_lock();
6d2010ae 2720
39236c6e
A
2721 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2722 while (next_p) {
2723 kern_return_t kr;
2724 uint32_t purgeable, wired, clean, dirty;
2725 boolean_t shared;
2726 uint32_t pages;
2727 uint32_t max_pages = 0;
316670eb
A
2728 uint32_t state;
2729
39236c6e
A
2730 p = next_p;
2731 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6d2010ae 2732
39236c6e
A
2733 aPid = p->p_pid;
2734 state = p->p_memstat_state;
6d2010ae 2735
316670eb 2736 /* Ensure the process is eligible for freezing */
39236c6e 2737 if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
316670eb
A
2738 continue; // with lock held
2739 }
316670eb 2740
39236c6e 2741 /* Only freeze processes meeting our minimum resident page criteria */
fe8ab488 2742 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
39236c6e
A
2743 if (pages < memorystatus_freeze_pages_min) {
2744 continue; // with lock held
2745 }
6d2010ae 2746
fe8ab488 2747 if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
316670eb
A
2748 /* Ensure there's enough free space to freeze this process. */
2749 max_pages = MIN(default_pager_swap_pages_free(), memorystatus_freeze_pages_max);
2750 if (max_pages < memorystatus_freeze_pages_min) {
2751 *memorystatus_freeze_swap_low = TRUE;
39236c6e
A
2752 proc_list_unlock();
2753 goto exit;
316670eb 2754 }
39236c6e
A
2755 } else {
2756 max_pages = UINT32_MAX - 1;
2757 }
2758
2759 /* Mark as locked temporarily to avoid kill */
2760 p->p_memstat_state |= P_MEMSTAT_LOCKED;
2761
2762 p = proc_ref_locked(p);
2763 proc_list_unlock();
2764 if (!p) {
2765 goto exit;
2766 }
2767
2768 kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
2769
2770 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - "
2771 "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n",
2772 (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (p->p_comm ? p->p_comm : "(unknown)"),
2773 memorystatus_available_pages, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free());
2774
2775 proc_list_lock();
2776 p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
2777
2778 /* Success? */
2779 if (KERN_SUCCESS == kr) {
2780 memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
316670eb 2781
39236c6e 2782 memorystatus_frozen_count++;
316670eb 2783
39236c6e
A
2784 p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
2785
2786 /* Update stats */
2787 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
2788 throttle_intervals[i].pageouts += dirty;
2789 }
2790
2791 memorystatus_freeze_pageouts += dirty;
2792 memorystatus_freeze_count++;
2793
2794 proc_list_unlock();
6d2010ae 2795
39236c6e 2796 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
6d2010ae 2797
39236c6e
A
2798 /* Return the number of reclaimed pages */
2799 ret = dirty;
6d2010ae 2800
39236c6e
A
2801 } else {
2802 proc_list_unlock();
316670eb 2803 }
39236c6e
A
2804
2805 proc_rele(p);
2806 goto exit;
6d2010ae 2807 }
316670eb 2808
39236c6e
A
2809 proc_list_unlock();
2810
2811exit:
2812 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
2813 memorystatus_available_pages, aPid, 0, 0, 0);
316670eb 2814
39236c6e 2815 return ret;
6d2010ae
A
2816}
2817
316670eb
A
2818static inline boolean_t
2819memorystatus_can_freeze_processes(void)
6d2010ae 2820{
316670eb 2821 boolean_t ret;
6d2010ae 2822
39236c6e 2823 proc_list_lock();
316670eb
A
2824
2825 if (memorystatus_suspended_count) {
2826 uint32_t average_resident_pages, estimated_processes;
2827
2828 /* Estimate the number of suspended processes we can fit */
39236c6e 2829 average_resident_pages = memorystatus_suspended_footprint_total / memorystatus_suspended_count;
316670eb
A
2830 estimated_processes = memorystatus_suspended_count +
2831 ((memorystatus_available_pages - memorystatus_available_pages_critical) / average_resident_pages);
2832
2833 /* If it's predicted that no freeze will occur, lower the threshold temporarily */
2834 if (estimated_processes <= FREEZE_SUSPENDED_THRESHOLD_DEFAULT) {
2835 memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_LOW;
6d2010ae 2836 } else {
39236c6e 2837 memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
6d2010ae 2838 }
6d2010ae 2839
316670eb
A
2840 MEMORYSTATUS_DEBUG(1, "memorystatus_can_freeze_processes: %d suspended processes, %d average resident pages / process, %d suspended processes estimated\n",
2841 memorystatus_suspended_count, average_resident_pages, estimated_processes);
6d2010ae 2842
316670eb
A
2843 if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) {
2844 ret = TRUE;
2845 } else {
2846 ret = FALSE;
6d2010ae 2847 }
316670eb
A
2848 } else {
2849 ret = FALSE;
6d2010ae 2850 }
316670eb 2851
39236c6e 2852 proc_list_unlock();
6d2010ae 2853
316670eb 2854 return ret;
6d2010ae
A
2855}
2856
316670eb
A
2857static boolean_t
2858memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
6d2010ae 2859{
316670eb
A
2860 /* Only freeze if we're sufficiently low on memory; this holds off freeze right
2861 after boot, and is generally is a no-op once we've reached steady state. */
2862 if (memorystatus_available_pages > memorystatus_freeze_threshold) {
2863 return FALSE;
2864 }
2865
2866 /* Check minimum suspended process threshold. */
2867 if (!memorystatus_can_freeze_processes()) {
2868 return FALSE;
2869 }
6d2010ae 2870
316670eb
A
2871 /* Is swap running low? */
2872 if (*memorystatus_freeze_swap_low) {
2873 /* If there's been no movement in free swap pages since we last attempted freeze, return. */
2874 if (default_pager_swap_pages_free() < memorystatus_freeze_pages_min) {
2875 return FALSE;
2876 }
2877
2878 /* Pages have been freed - we can retry. */
2879 *memorystatus_freeze_swap_low = FALSE;
6d2010ae
A
2880 }
2881
316670eb
A
2882 /* OK */
2883 return TRUE;
6d2010ae
A
2884}
2885
2886static void
316670eb 2887memorystatus_freeze_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
6d2010ae
A
2888{
2889 if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) {
2890 if (!interval->max_pageouts) {
316670eb 2891 interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * FREEZE_DAILY_PAGEOUTS_MAX) / (24 * 60)));
6d2010ae 2892 } else {
316670eb 2893 printf("memorystatus_freeze_update_throttle_interval: %d minute throttle timeout, resetting\n", interval->mins);
6d2010ae
A
2894 }
2895 interval->ts.tv_sec = interval->mins * 60;
2896 interval->ts.tv_nsec = 0;
2897 ADD_MACH_TIMESPEC(&interval->ts, ts);
316670eb 2898 /* Since we update the throttle stats pre-freeze, adjust for overshoot here */
6d2010ae
A
2899 if (interval->pageouts > interval->max_pageouts) {
2900 interval->pageouts -= interval->max_pageouts;
2901 } else {
2902 interval->pageouts = 0;
2903 }
2904 interval->throttle = FALSE;
2905 } else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) {
316670eb 2906 printf("memorystatus_freeze_update_throttle_interval: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
6d2010ae
A
2907 interval->throttle = TRUE;
2908 }
316670eb
A
2909
2910 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
6d2010ae
A
2911 interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60,
2912 interval->throttle ? "on" : "off");
6d2010ae
A
2913}
2914
2915static boolean_t
316670eb 2916memorystatus_freeze_update_throttle(void)
6d2010ae
A
2917{
2918 clock_sec_t sec;
2919 clock_nsec_t nsec;
2920 mach_timespec_t ts;
2921 uint32_t i;
2922 boolean_t throttled = FALSE;
2923
2924#if DEVELOPMENT || DEBUG
316670eb 2925 if (!memorystatus_freeze_throttle_enabled)
6d2010ae
A
2926 return FALSE;
2927#endif
2928
2929 clock_get_system_nanotime(&sec, &nsec);
2930 ts.tv_sec = sec;
2931 ts.tv_nsec = nsec;
2932
316670eb 2933 /* Check freeze pageouts over multiple intervals and throttle if we've exceeded our budget.
6d2010ae 2934 *
316670eb 2935 * This ensures that periods of inactivity can't be used as 'credit' towards freeze if the device has
6d2010ae
A
2936 * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in
2937 * order to allow for bursts of activity.
2938 */
2939 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
316670eb 2940 memorystatus_freeze_update_throttle_interval(&ts, &throttle_intervals[i]);
6d2010ae
A
2941 if (throttle_intervals[i].throttle == TRUE)
2942 throttled = TRUE;
2943 }
2944
2945 return throttled;
2946}
2947
2948static void
316670eb 2949memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
6d2010ae 2950{
316670eb
A
2951 static boolean_t memorystatus_freeze_swap_low = FALSE;
2952
2953 if (memorystatus_freeze_enabled) {
2954 if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
fe8ab488
A
2955 /* Only freeze if we've not exceeded our pageout budgets or we're not backed by swap. */
2956 if (DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS ||
2957 !memorystatus_freeze_update_throttle()) {
39236c6e 2958 memorystatus_freeze_top_process(&memorystatus_freeze_swap_low);
316670eb
A
2959 } else {
2960 printf("memorystatus_freeze_thread: in throttle, ignoring freeze\n");
2961 memorystatus_freeze_throttle_count++; /* Throttled, update stats */
2962 }
2963 }
2964 }
6d2010ae 2965
316670eb
A
2966 assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
2967 thread_block((thread_continue_t) memorystatus_freeze_thread);
2968}
2969
2970#endif /* CONFIG_FREEZE */
6d2010ae 2971
fe8ab488 2972#if VM_PRESSURE_EVENTS
6d2010ae 2973
fe8ab488 2974#if CONFIG_MEMORYSTATUS
316670eb 2975
fe8ab488
A
2976static int
2977memorystatus_send_note(int event_code, void *data, size_t data_length) {
2978 int ret;
2979 struct kev_msg ev_msg;
316670eb 2980
fe8ab488
A
2981 ev_msg.vendor_code = KEV_VENDOR_APPLE;
2982 ev_msg.kev_class = KEV_SYSTEM_CLASS;
2983 ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;
2984
2985 ev_msg.event_code = event_code;
2986
2987 ev_msg.dv[0].data_length = data_length;
2988 ev_msg.dv[0].data_ptr = data;
2989 ev_msg.dv[1].data_length = 0;
2990
2991 ret = kev_post_msg(&ev_msg);
2992 if (ret) {
2993 printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
316670eb 2994 }
39236c6e 2995
fe8ab488 2996 return ret;
316670eb
A
2997}
2998
fe8ab488
A
2999boolean_t
3000memorystatus_warn_process(pid_t pid, boolean_t critical) {
316670eb 3001
fe8ab488
A
3002 boolean_t ret = FALSE;
3003 struct knote *kn = NULL;
316670eb 3004
fe8ab488
A
3005 /*
3006 * See comment in sysctl_memorystatus_vm_pressure_send.
3007 */
39236c6e 3008
fe8ab488
A
3009 memorystatus_klist_lock();
3010 kn = vm_find_knote_from_pid(pid, &memorystatus_klist);
3011 if (kn) {
3012 /*
3013 * By setting the "fflags" here, we are forcing
3014 * a process to deal with the case where it's
3015 * bumping up into its memory limits. If we don't
3016 * do this here, we will end up depending on the
3017 * system pressure snapshot evaluation in
3018 * filt_memorystatus().
3019 */
39236c6e 3020
fe8ab488
A
3021 if (critical) {
3022 kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
39236c6e 3023 } else {
fe8ab488 3024 kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_WARN;
39236c6e 3025 }
fe8ab488
A
3026 KNOTE(&memorystatus_klist, kMemorystatusPressure);
3027 ret = TRUE;
3028 } else {
3029 if (vm_dispatch_pressure_note_to_pid(pid, FALSE) == 0) {
3030 ret = TRUE;
6d2010ae
A
3031 }
3032 }
fe8ab488 3033 memorystatus_klist_unlock();
6d2010ae 3034
fe8ab488 3035 return ret;
316670eb
A
3036}
3037
39236c6e 3038int
316670eb 3039memorystatus_send_pressure_note(pid_t pid) {
39236c6e
A
3040 MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
3041 return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
6d2010ae
A
3042}
3043
fe8ab488
A
3044void
3045memorystatus_send_low_swap_note(void) {
3046
3047 struct knote *kn = NULL;
3048
3049 memorystatus_klist_lock();
3050 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
3051 if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
3052 KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
3053 }
3054 }
3055 memorystatus_klist_unlock();
3056}
3057
39236c6e
A
3058boolean_t
3059memorystatus_bg_pressure_eligible(proc_t p) {
3060 boolean_t eligible = FALSE;
3061
3062 proc_list_lock();
3063
3064 MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);
3065
3066 /* Foreground processes have already been dealt with at this point, so just test for eligibility */
3067 if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
3068 eligible = TRUE;
3069 }
3070
3071 proc_list_unlock();
3072
3073 return eligible;
3074}
3075
3076boolean_t
3077memorystatus_is_foreground_locked(proc_t p) {
3078 return ((p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
3079 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT));
3080}
fe8ab488 3081#endif /* CONFIG_MEMORYSTATUS */
39236c6e
A
3082
3083/*
3084 * Trigger levels to test the mechanism.
3085 * Can be used via a sysctl.
3086 */
3087#define TEST_LOW_MEMORY_TRIGGER_ONE 1
3088#define TEST_LOW_MEMORY_TRIGGER_ALL 2
3089#define TEST_PURGEABLE_TRIGGER_ONE 3
3090#define TEST_PURGEABLE_TRIGGER_ALL 4
3091#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE 5
3092#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL 6
3093
3094boolean_t memorystatus_manual_testing_on = FALSE;
3095vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;
3096
3097extern struct knote *
fe8ab488 3098vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t);
39236c6e
A
3099
3100extern
fe8ab488 3101kern_return_t vm_pressure_notification_without_levels(boolean_t);
39236c6e
A
3102
3103extern void vm_pressure_klist_lock(void);
3104extern void vm_pressure_klist_unlock(void);
3105
3106extern void vm_reset_active_list(void);
3107
3108extern void delay(int);
3109
3110#define INTER_NOTIFICATION_DELAY (250000) /* .25 second */
3111
/*
 * Hook for the end of a pageout scan. Intentionally does nothing here.
 */
void
memorystatus_on_pageout_scan_end(void)
{
	return;
}
3115
3116/*
3117 * kn_max - knote
3118 *
3119 * knote_pressure_level - to check if the knote is registered for this notification level.
3120 *
3121 * task - task whose bits we'll be modifying
3122 *
3123 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
3124 *
3125 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
3126 *
3127 */
39236c6e
A
3128
3129boolean_t
3130is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
3131{
3132 if (kn_max->kn_sfflags & knote_pressure_level) {
3133
3134 if (task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
3135
3136 task_clear_has_been_notified(task, pressure_level_to_clear);
3137 }
3138
3139 task_mark_has_been_notified(task, pressure_level_to_set);
3140 return TRUE;
3141 }
3142
3143 return FALSE;
3144}
3145
fe8ab488
A
3146extern kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process);
3147
3148#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */
39236c6e
A
3149
3150kern_return_t
fe8ab488 3151memorystatus_update_vm_pressure(boolean_t target_foreground_process)
39236c6e
A
3152{
3153 struct knote *kn_max = NULL;
3154 pid_t target_pid = -1;
3155 struct klist dispatch_klist = { NULL };
3156 proc_t target_proc = PROC_NULL;
39236c6e
A
3157 struct task *task = NULL;
3158 boolean_t found_candidate = FALSE;
3159
fe8ab488
A
3160 static vm_pressure_level_t level_snapshot = kVMPressureNormal;
3161 static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
3162 boolean_t smoothing_window_started = FALSE;
3163 struct timeval smoothing_window_start_tstamp = {0, 0};
3164 struct timeval curr_tstamp = {0, 0};
3165 int elapsed_msecs = 0;
3166
3167#if !CONFIG_JETSAM
3168#define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */
3169
3170 int idle_kill_counter = 0;
3171
3172 /*
3173 * On desktop we take this opportunity to free up memory pressure
3174 * by immediately killing idle exitable processes. We use a delay
3175 * to avoid overkill. And we impose a max counter as a fail safe
3176 * in case daemons re-launch too fast.
3177 */
3178 while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
3179 if (memorystatus_idle_exit_from_VM() == FALSE) {
3180 /* No idle exitable processes left to kill */
3181 break;
3182 }
3183 idle_kill_counter++;
3184 delay(1000000); /* 1 second */
3185 }
3186#endif /* !CONFIG_JETSAM */
3187
39236c6e
A
3188 while (1) {
3189
3190 /*
3191 * There is a race window here. But it's not clear
3192 * how much we benefit from having extra synchronization.
3193 */
3194 level_snapshot = memorystatus_vm_pressure_level;
3195
fe8ab488
A
3196 if (prev_level_snapshot > level_snapshot) {
3197 /*
3198 * Pressure decreased? Let's take a little breather
3199 * and see if this condition stays.
3200 */
3201 if (smoothing_window_started == FALSE) {
3202
3203 smoothing_window_started = TRUE;
3204 microuptime(&smoothing_window_start_tstamp);
3205 }
3206
3207 microuptime(&curr_tstamp);
3208 timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
3209 elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
3210
3211 if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
3212
3213 delay(INTER_NOTIFICATION_DELAY);
3214 continue;
3215 }
3216 }
3217
3218 prev_level_snapshot = level_snapshot;
3219 smoothing_window_started = FALSE;
3220
39236c6e 3221 memorystatus_klist_lock();
fe8ab488 3222 kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);
39236c6e
A
3223
3224 if (kn_max == NULL) {
3225 memorystatus_klist_unlock();
3226
3227 /*
3228 * No more level-based clients to notify.
3229 * Try the non-level based notification clients.
3230 *
3231 * However, these non-level clients don't understand
3232 * the "return-to-normal" notification.
3233 *
3234 * So don't consider them for those notifications. Just
3235 * return instead.
3236 *
3237 */
3238
3239 if (level_snapshot != kVMPressureNormal) {
3240 goto try_dispatch_vm_clients;
3241 } else {
3242 return KERN_FAILURE;
3243 }
3244 }
3245
3246 target_proc = kn_max->kn_kq->kq_p;
3247
3248 proc_list_lock();
3249 if (target_proc != proc_ref_locked(target_proc)) {
3250 target_proc = PROC_NULL;
3251 proc_list_unlock();
3252 memorystatus_klist_unlock();
3253 continue;
3254 }
3255 proc_list_unlock();
3256 memorystatus_klist_unlock();
3257
3258 target_pid = target_proc->p_pid;
3259
3260 task = (struct task *)(target_proc->task);
3261
3262 if (level_snapshot != kVMPressureNormal) {
3263
3264 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
3265
3266 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, kVMPressureCritical, kVMPressureWarning) == TRUE) {
3267 found_candidate = TRUE;
3268 }
3269 } else {
3270 if (level_snapshot == kVMPressureCritical) {
3271
3272 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, kVMPressureWarning, kVMPressureCritical) == TRUE) {
3273 found_candidate = TRUE;
3274 }
3275 }
3276 }
3277 } else {
3278 if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
3279
3280 task_clear_has_been_notified(task, kVMPressureWarning);
3281 task_clear_has_been_notified(task, kVMPressureCritical);
3282
3283 found_candidate = TRUE;
6d2010ae
A
3284 }
3285 }
39236c6e
A
3286
3287 if (found_candidate == FALSE) {
3288 continue;
3289 }
3290
3291 memorystatus_klist_lock();
3292 KNOTE_DETACH(&memorystatus_klist, kn_max);
3293 KNOTE_ATTACH(&dispatch_klist, kn_max);
3294 memorystatus_klist_unlock();
3295
3296 KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
3297
3298 memorystatus_klist_lock();
3299 KNOTE_DETACH(&dispatch_klist, kn_max);
3300 KNOTE_ATTACH(&memorystatus_klist, kn_max);
3301 memorystatus_klist_unlock();
3302
3303 microuptime(&target_proc->vm_pressure_last_notify_tstamp);
3304 proc_rele(target_proc);
3305
fe8ab488 3306 if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
39236c6e
A
3307 break;
3308 }
3309
3310try_dispatch_vm_clients:
fe8ab488
A
3311 if (kn_max == NULL && level_snapshot != kVMPressureNormal) {
3312 /*
3313 * We will exit this loop when we are done with
3314 * notification clients (level and non-level based).
39236c6e 3315 */
fe8ab488 3316 if ((vm_pressure_notify_dispatch_vm_clients(target_foreground_process) == KERN_FAILURE) && (kn_max == NULL)) {
39236c6e
A
3317 /*
3318 * kn_max == NULL i.e. we didn't find any eligible clients for the level-based notifications
3319 * AND
3320 * we have failed to find any eligible clients for the non-level based notifications too.
3321 * So, we are done.
3322 */
3323
3324 return KERN_FAILURE;
3325 }
3326 }
3327
fe8ab488
A
3328 /*
3329 * LD: This block of code below used to be invoked in the older memory notification scheme on embedded everytime
3330 * a process was sent a memory pressure notification. The "memorystatus_klist" list was used to hold these
3331 * privileged listeners. But now we have moved to the newer scheme and are trying to move away from the extra
3332 * notifications. So the code is here in case we break compat. and need to send out notifications to the privileged
3333 * apps.
3334 */
3335#if 0
3336#endif /* 0 */
3337
3338 if (memorystatus_manual_testing_on == TRUE) {
3339 /*
3340 * Testing out the pressure notification scheme.
3341 * No need for delays etc.
3342 */
3343 } else {
3344
3345 uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
3346#if CONFIG_JETSAM
3347 unsigned int page_delta = 0;
3348 unsigned int skip_delay_page_threshold = 0;
3349
3350 assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
3351
3352 page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
3353 skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
3354
3355 if (memorystatus_available_pages <= skip_delay_page_threshold) {
3356 /*
3357 * We are nearing the critcal mark fast and can't afford to wait between
3358 * notifications.
3359 */
3360 sleep_interval = 0;
3361 }
3362#endif /* CONFIG_JETSAM */
3363
3364 if (sleep_interval) {
3365 delay(sleep_interval);
3366 }
39236c6e 3367 }
6d2010ae 3368 }
39236c6e
A
3369
3370 return KERN_SUCCESS;
6d2010ae
A
3371}
3372
39236c6e
A
3373vm_pressure_level_t
3374convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
3375
3376vm_pressure_level_t
3377convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
3378{
3379 vm_pressure_level_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
3380
3381 switch (internal_pressure_level) {
3382
3383 case kVMPressureNormal:
3384 {
3385 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
3386 break;
3387 }
3388
3389 case kVMPressureWarning:
3390 case kVMPressureUrgent:
3391 {
3392 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
3393 break;
3394 }
3395
3396 case kVMPressureCritical:
3397 {
3398 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
3399 break;
3400 }
3401
3402 default:
3403 break;
3404 }
316670eb 3405
39236c6e
A
3406 return dispatch_level;
3407}
6d2010ae 3408
b0d623f7 3409static int
39236c6e 3410sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
b0d623f7 3411{
39236c6e 3412#pragma unused(arg1, arg2, oidp)
39236c6e
A
3413 vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
3414
3415 return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
3416}
3417
fe8ab488
A
3418#if DEBUG || DEVELOPMENT
3419
39236c6e
A
3420SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED,
3421 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
3422
fe8ab488
A
3423#else /* DEBUG || DEVELOPMENT */
3424
3425SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED,
3426 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
3427
3428#endif /* DEBUG || DEVELOPMENT */
b0d623f7 3429
39236c6e
A
3430extern int memorystatus_purge_on_warning;
3431extern int memorystatus_purge_on_critical;
3432
3433static int
3434sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
3435{
3436#pragma unused(arg1, arg2)
b0d623f7 3437
39236c6e
A
3438 int level = 0;
3439 int error = 0;
3440 int pressure_level = 0;
3441 int trigger_request = 0;
3442 int force_purge;
3443
3444 error = sysctl_handle_int(oidp, &level, 0, req);
3445 if (error || !req->newptr) {
3446 return (error);
3447 }
3448
3449 memorystatus_manual_testing_on = TRUE;
3450
3451 trigger_request = (level >> 16) & 0xFFFF;
3452 pressure_level = (level & 0xFFFF);
3453
3454 if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
3455 trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
3456 return EINVAL;
3457 }
3458 switch (pressure_level) {
3459 case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
3460 case NOTE_MEMORYSTATUS_PRESSURE_WARN:
3461 case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
3462 break;
3463 default:
b0d623f7
A
3464 return EINVAL;
3465 }
b0d623f7 3466
39236c6e
A
3467 /*
3468 * The pressure level is being set from user-space.
3469 * And user-space uses the constants in sys/event.h
3470 * So we translate those events to our internal levels here.
3471 */
3472 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
3473
3474 memorystatus_manual_testing_level = kVMPressureNormal;
3475 force_purge = 0;
3476
3477 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
3478
3479 memorystatus_manual_testing_level = kVMPressureWarning;
3480 force_purge = memorystatus_purge_on_warning;
3481
3482 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
3483
3484 memorystatus_manual_testing_level = kVMPressureCritical;
3485 force_purge = memorystatus_purge_on_critical;
b0d623f7
A
3486 }
3487
39236c6e 3488 memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
316670eb 3489
39236c6e
A
3490 /* purge according to the new pressure level */
3491 switch (trigger_request) {
3492 case TEST_PURGEABLE_TRIGGER_ONE:
3493 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
3494 if (force_purge == 0) {
3495 /* no purging requested */
3496 break;
3497 }
3498 vm_purgeable_object_purge_one_unlocked(force_purge);
3499 break;
3500 case TEST_PURGEABLE_TRIGGER_ALL:
3501 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
3502 if (force_purge == 0) {
3503 /* no purging requested */
3504 break;
3505 }
3506 while (vm_purgeable_object_purge_one_unlocked(force_purge));
3507 break;
3508 }
3509
3510 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
3511 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
3512
3513 memorystatus_update_vm_pressure(TRUE);
3514 }
3515
3516 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
3517 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
3518
3519 while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
3520 continue;
3521 }
3522 }
3523
3524 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
3525 memorystatus_manual_testing_on = FALSE;
3526
3527 vm_pressure_klist_lock();
3528 vm_reset_active_list();
3529 vm_pressure_klist_unlock();
3530 } else {
3531
3532 vm_pressure_klist_lock();
fe8ab488 3533 vm_pressure_notification_without_levels(FALSE);
39236c6e
A
3534 vm_pressure_klist_unlock();
3535 }
3536
3537 return 0;
b0d623f7
A
3538}
3539
39236c6e
A
3540SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
3541 0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
3542
3543
3544extern int memorystatus_purge_on_warning;
3545extern int memorystatus_purge_on_urgent;
3546extern int memorystatus_purge_on_critical;
3547
3548SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_warning, 0, "");
3549SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_urgent, 0, "");
3550SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_critical, 0, "");
3551
3552
fe8ab488 3553#endif /* VM_PRESSURE_EVENTS */
39236c6e
A
3554
3555/* Return both allocated and actual size, since there's a race between allocation and list compilation */
b0d623f7 3556static int
39236c6e 3557memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
b0d623f7 3558{
316670eb 3559 uint32_t list_count, i = 0;
39236c6e
A
3560 memorystatus_priority_entry_t *list_entry;
3561 proc_t p;
3562
316670eb 3563 list_count = memorystatus_list_count;
39236c6e
A
3564 *list_size = sizeof(memorystatus_priority_entry_t) * list_count;
3565
3566 /* Just a size check? */
3567 if (size_only) {
3568 return 0;
3569 }
3570
3571 /* Otherwise, validate the size of the buffer */
3572 if (*buffer_size < *list_size) {
3573 return EINVAL;
3574 }
3575
3576 *list_ptr = (memorystatus_priority_entry_t*)kalloc(*list_size);
3577 if (!list_ptr) {
316670eb
A
3578 return ENOMEM;
3579 }
3580
39236c6e
A
3581 memset(*list_ptr, 0, *list_size);
3582
3583 *buffer_size = *list_size;
3584 *list_size = 0;
3585
3586 list_entry = *list_ptr;
3587
3588 proc_list_lock();
3589
3590 p = memorystatus_get_first_proc_locked(&i, TRUE);
3591 while (p && (*list_size < *buffer_size)) {
3592 list_entry->pid = p->p_pid;
3593 list_entry->priority = p->p_memstat_effectivepriority;
3594 list_entry->user_data = p->p_memstat_userdata;
3595#if LEGACY_HIWATER
3596 if (((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) ||
3597 (p->p_memstat_memlimit <= 0)) {
3598 task_get_phys_footprint_limit(p->task, &list_entry->limit);
3599 } else {
3600 list_entry->limit = p->p_memstat_memlimit;
3601 }
3602#else
3603 task_get_phys_footprint_limit(p->task, &list_entry->limit);
3604#endif
3605 list_entry->state = memorystatus_build_state(p);
3606 list_entry++;
3607
3608 *list_size += sizeof(memorystatus_priority_entry_t);
3609
3610 p = memorystatus_get_next_proc_locked(&i, p, TRUE);
316670eb
A
3611 }
3612
39236c6e 3613 proc_list_unlock();
316670eb 3614
39236c6e 3615 MEMORYSTATUS_DEBUG(1, "memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);
316670eb 3616
39236c6e
A
3617 return 0;
3618}
b0d623f7 3619
39236c6e
A
3620static int
3621memorystatus_cmd_get_priority_list(user_addr_t buffer, size_t buffer_size, int32_t *retval) {
3622 int error = EINVAL;
3623 boolean_t size_only;
3624 memorystatus_priority_entry_t *list = NULL;
3625 size_t list_size;
316670eb 3626
39236c6e
A
3627 size_only = ((buffer == USER_ADDR_NULL) ? TRUE: FALSE);
3628
3629 error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);
3630 if (error) {
3631 goto out;
3632 }
3633
3634 if (!size_only) {
3635 error = copyout(list, buffer, list_size);
3636 }
3637
3638 if (error == 0) {
3639 *retval = list_size;
3640 }
3641out:
3642
3643 if (list) {
3644 kfree(list, buffer_size);
3645 }
3646
3647 return error;
316670eb 3648}
b0d623f7 3649
39236c6e
A
3650#if CONFIG_JETSAM
3651
3652static void
3653memorystatus_clear_errors(void)
3654{
3655 proc_t p;
3656 unsigned int i = 0;
3657
3658 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START, 0, 0, 0, 0, 0);
3659
3660 proc_list_lock();
3661
3662 p = memorystatus_get_first_proc_locked(&i, TRUE);
3663 while (p) {
3664 if (p->p_memstat_state & P_MEMSTAT_ERROR) {
3665 p->p_memstat_state &= ~P_MEMSTAT_ERROR;
3666 }
3667 p = memorystatus_get_next_proc_locked(&i, p, TRUE);
3668 }
3669
3670 proc_list_unlock();
3671
3672 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END, 0, 0, 0, 0, 0);
3673}
b0d623f7 3674
316670eb 3675static void
39236c6e 3676memorystatus_update_levels_locked(boolean_t critical_only) {
fe8ab488 3677
39236c6e 3678 memorystatus_available_pages_critical = memorystatus_available_pages_critical_base;
fe8ab488
A
3679
3680 /*
3681 * If there's an entry in the first bucket, we have idle processes.
3682 */
3683 memstat_bucket_t *first_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3684 if (first_bucket->count) {
3685 memorystatus_available_pages_critical += memorystatus_available_pages_critical_idle_offset;
3686
3687 if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure ) {
3688 /*
3689 * The critical threshold must never exceed the pressure threshold
3690 */
3691 memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
39236c6e
A
3692 }
3693 }
fe8ab488 3694
316670eb
A
3695#if DEBUG || DEVELOPMENT
3696 if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
3697 memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_diagnostic;
fe8ab488
A
3698
3699 if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure ) {
3700 /*
3701 * The critical threshold must never exceed the pressure threshold
3702 */
3703 memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
3704 }
39236c6e
A
3705 }
3706#endif
3707
3708 if (critical_only) {
3709 return;
3710 }
3711
316670eb 3712#if VM_PRESSURE_EVENTS
39236c6e
A
3713 memorystatus_available_pages_pressure = (pressure_threshold_percentage / delta_percentage) * memorystatus_delta;
3714#if DEBUG || DEVELOPMENT
3715 if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
316670eb 3716 memorystatus_available_pages_pressure += memorystatus_jetsam_policy_offset_pages_diagnostic;
316670eb
A
3717 }
3718#endif
39236c6e
A
3719#endif
3720}
3721
3722static int
3723memorystatus_get_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
3724 size_t input_size = *snapshot_size;
316670eb 3725
39236c6e
A
3726 if (memorystatus_jetsam_snapshot_count > 0) {
3727 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
3728 } else {
3729 *snapshot_size = 0;
3730 }
3731
3732 if (size_only) {
3733 return 0;
316670eb 3734 }
39236c6e
A
3735
3736 if (input_size < *snapshot_size) {
3737 return EINVAL;
3738 }
3739
3740 *snapshot = memorystatus_jetsam_snapshot;
3741
3742 MEMORYSTATUS_DEBUG(1, "memorystatus_snapshot: returning %ld for size\n", (long)*snapshot_size);
3743
3744 return 0;
316670eb
A
3745}
3746
fe8ab488 3747
316670eb 3748static int
39236c6e
A
3749memorystatus_cmd_get_jetsam_snapshot(user_addr_t buffer, size_t buffer_size, int32_t *retval) {
3750 int error = EINVAL;
3751 boolean_t size_only;
3752 memorystatus_jetsam_snapshot_t *snapshot;
3753
3754 size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);
3755
3756 error = memorystatus_get_snapshot(&snapshot, &buffer_size, size_only);
3757 if (error) {
3758 goto out;
3759 }
316670eb 3760
39236c6e
A
3761 /* Copy out and reset */
3762 if (!size_only) {
3763 if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
3764 snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
3765 }
3766 }
316670eb 3767
39236c6e
A
3768 if (error == 0) {
3769 *retval = buffer_size;
3770 }
3771out:
3772 return error;
3773}
316670eb 3774
fe8ab488
A
3775/*
3776 * Routine: memorystatus_cmd_grp_set_properties
3777 * Purpose: Update properties for a group of processes.
3778 *
3779 * Supported Properties:
3780 * [priority]
3781 * Move each process out of its effective priority
3782 * band and into a new priority band.
3783 * Maintains relative order from lowest to highest priority.
3784 * In single band, maintains relative order from head to tail.
3785 *
3786 * eg: before [effectivepriority | pid]
3787 * [18 | p101 ]
3788 * [17 | p55, p67, p19 ]
3789 * [12 | p103 p10 ]
3790 * [ 7 | p25 ]
3791 * [ 0 | p71, p82, ]
3792 *
3793 * after [ new band | pid]
3794 * [ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
3795 *
3796 * Returns: 0 on success, else non-zero.
3797 *
3798 * Caveat: We know there is a race window regarding recycled pids.
3799 * A process could be killed before the kernel can act on it here.
3800 * If a pid cannot be found in any of the jetsam priority bands,
3801 * then we simply ignore it. No harm.
3802 * But, if the pid has been recycled then it could be an issue.
3803 * In that scenario, we might move an unsuspecting process to the new
3804 * priority band. It's not clear how the kernel can safeguard
3805 * against this, but it would be an extremely rare case anyway.
3806 * The caller of this api might avoid such race conditions by
3807 * ensuring that the processes passed in the pid list are suspended.
3808 */
3809
3810
3811/* This internal structure can expand when we add support for more properties */
3812typedef struct memorystatus_internal_properties
3813{
3814 proc_t proc;
3815 int32_t priority; /* see memorytstatus_priority_entry_t : priority */
3816} memorystatus_internal_properties_t;
3817
3818
3819static int
3820memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
3821
3822#pragma unused (flags)
3823
3824 /*
3825 * We only handle setting priority
3826 * per process
3827 */
3828
3829 int error = 0;
3830 memorystatus_priority_entry_t *entries = NULL;
3831 uint32_t entry_count = 0;
3832
3833 /* This will be the ordered proc list */
3834 memorystatus_internal_properties_t *table = NULL;
3835 size_t table_size = 0;
3836 uint32_t table_count = 0;
3837
3838 uint32_t i = 0;
3839 uint32_t bucket_index = 0;
3840 boolean_t head_insert;
3841 int32_t new_priority;
3842
3843 proc_t p;
3844
3845 /* Verify inputs */
3846 if ((buffer == USER_ADDR_NULL) || (buffer_size == 0) || ((buffer_size % sizeof(memorystatus_priority_entry_t)) != 0)) {
3847 error = EINVAL;
3848 goto out;
3849 }
3850
3851 entry_count = (buffer_size / sizeof(memorystatus_priority_entry_t));
3852 if ((entries = (memorystatus_priority_entry_t *)kalloc(buffer_size)) == NULL) {
3853 error = ENOMEM;
3854 goto out;
3855 }
3856
3857 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, entry_count, 0, 0, 0, 0);
3858
3859 if ((error = copyin(buffer, entries, buffer_size)) != 0) {
3860 goto out;
3861 }
3862
3863 /* Verify sanity of input priorities */
3864 for (i=0; i < entry_count; i++) {
3865 if (entries[i].priority == -1) {
3866 /* Use as shorthand for default priority */
3867 entries[i].priority = JETSAM_PRIORITY_DEFAULT;
3868 } else if (entries[i].priority == JETSAM_PRIORITY_IDLE_DEFERRED) {
3869 /* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use;
3870 * if requested, adjust to JETSAM_PRIORITY_IDLE. */
3871 entries[i].priority = JETSAM_PRIORITY_IDLE;
3872 } else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
3873 /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
3874 * queue */
3875 /* Deal with this later */
3876 } else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
3877 /* Sanity check */
3878 error = EINVAL;
3879 goto out;
3880 }
3881 }
3882
3883 table_size = sizeof(memorystatus_internal_properties_t) * entry_count;
3884 if ( (table = (memorystatus_internal_properties_t *)kalloc(table_size)) == NULL) {
3885 error = ENOMEM;
3886 goto out;
3887 }
3888 memset(table, 0, table_size);
3889
3890
3891 /*
3892 * For each jetsam bucket entry, spin through the input property list.
3893 * When a matching pid is found, populate an adjacent table with the
3894 * appropriate proc pointer and new property values.
3895 * This traversal automatically preserves order from lowest
3896 * to highest priority.
3897 */
3898
3899 bucket_index=0;
3900
3901 proc_list_lock();
3902
3903 /* Create the ordered table */
3904 p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
3905 while (p && (table_count < entry_count)) {
3906 for (i=0; i < entry_count; i++ ) {
3907 if (p->p_pid == entries[i].pid) {
3908 /* Build the table data */
3909 table[table_count].proc = p;
3910 table[table_count].priority = entries[i].priority;
3911 table_count++;
3912 break;
3913 }
3914 }
3915 p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
3916 }
3917
3918 /* We now have ordered list of procs ready to move */
3919 for (i=0; i < table_count; i++) {
3920 p = table[i].proc;
3921 assert(p != NULL);
3922
3923 /* Allow head inserts -- but relative order is now */
3924 if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
3925 new_priority = JETSAM_PRIORITY_IDLE;
3926 head_insert = true;
3927 } else {
3928 new_priority = table[i].priority;
3929 head_insert = false;
3930 }
3931
3932 /* Not allowed */
3933 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3934 continue;
3935 }
3936
3937 /*
3938 * Take appropriate steps if moving proc out of the
3939 * JETSAM_PRIORITY_IDLE_DEFERRED band.
3940 */
3941 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
3942 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
3943 }
3944
3945 memorystatus_update_priority_locked(p, new_priority, head_insert);
3946 }
3947
3948 proc_list_unlock();
3949
3950 /*
3951 * if (table_count != entry_count)
3952 * then some pids were not found in a jetsam band.
3953 * harmless but interesting...
3954 */
3955 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, entry_count, table_count, 0, 0, 0);
3956
3957out:
3958 if (entries)
3959 kfree(entries, buffer_size);
3960 if (table)
3961 kfree(table, table_size);
3962
3963 return (error);
3964}
3965
3966
3967/*
3968 * This routine is meant solely for the purpose of adjusting jetsam priorities and bands.
3969 * It is _not_ meant to be used for the setting of memory limits, especially, since we can't
3970 * tell if the memory limit being set is fatal or not.
3971 *
3972 * So the the last 5 args to the memorystatus_update() call below, related to memory limits, are all 0 or FALSE.
3973 */
3974
39236c6e
A
3975static int
3976memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
3977 const uint32_t MAX_ENTRY_COUNT = 2; /* Cap the entry count */
316670eb 3978
39236c6e
A
3979 int error;
3980 uint32_t i;
3981 uint32_t entry_count;
3982 memorystatus_priority_properties_t *entries;
3983
3984 /* Validate inputs */
3985 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
3986 return EINVAL;
3987 }
3988
3989 /* Make sure the buffer is a multiple of the entry size, and that an excessive size isn't specified */
3990 entry_count = (buffer_size / sizeof(memorystatus_priority_properties_t));
3991 if (((buffer_size % sizeof(memorystatus_priority_properties_t)) != 0) || (entry_count > MAX_ENTRY_COUNT)) {
3992 return EINVAL;
316670eb 3993 }
316670eb 3994
39236c6e
A
3995 entries = (memorystatus_priority_properties_t *)kalloc(buffer_size);
3996
3997 error = copyin(buffer, entries, buffer_size);
316670eb 3998
39236c6e
A
3999 for (i = 0; i < entry_count; i++) {
4000 proc_t p;
4001
4002 if (error) {
4003 break;
4004 }
4005
4006 p = proc_find(pid);
4007 if (!p) {
4008 error = ESRCH;
4009 break;
4010 }
4011
4012 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
4013 error = EPERM;
4014 proc_rele(p);
4015 break;
4016 }
fe8ab488
A
4017
4018 error = memorystatus_update(p, entries[i].priority, entries[i].user_data, FALSE, FALSE, 0, 0, FALSE);
39236c6e
A
4019 proc_rele(p);
4020 }
4021
4022 kfree(entries, buffer_size);
4023
4024 return error;
b0d623f7
A
4025}
4026
39236c6e
A
4027static int
4028memorystatus_cmd_get_pressure_status(int32_t *retval) {
4029 int error;
4030
4031 /* Need privilege for check */
4032 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
4033 if (error) {
4034 return (error);
4035 }
4036
4037 /* Inherently racy, so it's not worth taking a lock here */
4038 *retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
4039
4040 return error;
4041}
316670eb 4042
fe8ab488
A
4043/*
4044 * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
4045 */
4046
b0d623f7 4047static int
fe8ab488 4048memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit) {
39236c6e
A
4049 int error = 0;
4050
4051 proc_t p = proc_find(pid);
4052 if (!p) {
4053 return ESRCH;
4054 }
4055
4056 if (high_water_mark <= 0) {
4057 high_water_mark = -1; /* Disable */
4058 }
4059
4060 proc_list_lock();
4061
39236c6e
A
4062 p->p_memstat_memlimit = high_water_mark;
4063 if (memorystatus_highwater_enabled) {
4064 if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) {
fe8ab488
A
4065
4066 memorystatus_update_priority_locked(p, p->p_memstat_effectivepriority, false);
4067
4068 /*
4069 * The update priority call above takes care to set/reset the fatal memory limit state
4070 * IF the process is transitioning between foreground <-> background and has a background
4071 * memory limit.
4072 * Here, however, the process won't be doing any such transitions and so we explicitly tackle
4073 * the fatal limit state.
4074 */
4075 is_fatal_limit = FALSE;
4076
39236c6e
A
4077 } else {
4078 error = (task_set_phys_footprint_limit_internal(p->task, high_water_mark, NULL, TRUE) == 0) ? 0 : EINVAL;
4079 }
4080 }
4081
fe8ab488
A
4082 if (error == 0) {
4083 if (is_fatal_limit == TRUE) {
4084 p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
4085 } else {
4086 p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
4087 }
4088 }
4089
39236c6e
A
4090 proc_list_unlock();
4091 proc_rele(p);
4092
4093 return error;
4094}
4095
fe8ab488
A
4096/*
4097 * Returns the jetsam priority (effective or requested) of the process
4098 * associated with this task.
4099 */
4100int
4101proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
4102{
4103 if (p) {
4104 if (effective_priority) {
4105 return p->p_memstat_effectivepriority;
4106 } else {
4107 return p->p_memstat_requestedpriority;
4108 }
4109 }
4110 return 0;
4111}
39236c6e 4112#endif /* CONFIG_JETSAM */
b0d623f7 4113
39236c6e
A
4114int
4115memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret) {
4116 int error = EINVAL;
4117
4118#if !CONFIG_JETSAM
4119 #pragma unused(ret)
4120#endif
4121
4122 /* Root only for now */
4123 if (!kauth_cred_issuser(kauth_cred_get())) {
4124 error = EPERM;
4125 goto out;
b0d623f7 4126 }
39236c6e
A
4127
4128 /* Sanity check */
4129 if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
4130 error = EINVAL;
4131 goto out;
4132 }
4133
4134 switch (args->command) {
4135 case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
4136 error = memorystatus_cmd_get_priority_list(args->buffer, args->buffersize, ret);
4137 break;
4138#if CONFIG_JETSAM
4139 case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
4140 error = memorystatus_cmd_set_priority_properties(args->pid, args->buffer, args->buffersize, ret);
4141 break;
fe8ab488
A
4142 case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
4143 error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
4144 break;
39236c6e
A
4145 case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
4146 error = memorystatus_cmd_get_jetsam_snapshot(args->buffer, args->buffersize, ret);
4147 break;
4148 case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
4149 error = memorystatus_cmd_get_pressure_status(ret);
4150 break;
4151 case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
fe8ab488
A
4152 error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
4153 break;
4154 case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
4155 error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
39236c6e
A
4156 break;
4157 /* Test commands */
4158#if DEVELOPMENT || DEBUG
4159 case MEMORYSTATUS_CMD_TEST_JETSAM:
4160 error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled) ? 0 : EINVAL;
4161 break;
4162 case MEMORYSTATUS_CMD_SET_JETSAM_PANIC_BITS:
4163 error = memorystatus_cmd_set_panic_bits(args->buffer, args->buffersize);
4164 break;
4165#endif /* DEVELOPMENT || DEBUG */
4166#endif /* CONFIG_JETSAM */
4167 default:
4168 break;
4169 }
4170
4171out:
4172 return error;
4173}
4174
4175
4176static int
4177filt_memorystatusattach(struct knote *kn)
4178{
4179 kn->kn_flags |= EV_CLEAR;
4180 return memorystatus_knote_register(kn);
4181}
4182
/*
 * Detach routine for the memorystatus filter: unregisters the knote
 * through memorystatus_knote_unregister().
 */
static void
filt_memorystatusdetach(struct knote *kn)
{
	memorystatus_knote_unregister(kn);
}
4188
4189static int
4190filt_memorystatus(struct knote *kn __unused, long hint)
4191{
4192 if (hint) {
4193 switch (hint) {
4194 case kMemorystatusNoPressure:
4195 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
4196 kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
4197 }
4198 break;
4199 case kMemorystatusPressure:
4200 if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
4201 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
4202 kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_WARN;
4203 }
4204 } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
4205
4206 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
4207 kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
4208 }
4209 }
4210 break;
fe8ab488
A
4211 case kMemorystatusLowSwap:
4212 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
4213 kn->kn_fflags |= NOTE_MEMORYSTATUS_LOW_SWAP;
4214 }
4215 break;
39236c6e
A
4216 default:
4217 break;
b0d623f7 4218 }
39236c6e
A
4219 }
4220
4221 return (kn->kn_fflags != 0);
4222}
4223
4224static void
4225memorystatus_klist_lock(void) {
4226 lck_mtx_lock(&memorystatus_klist_mutex);
4227}
4228
4229static void
4230memorystatus_klist_unlock(void) {
4231 lck_mtx_unlock(&memorystatus_klist_mutex);
4232}
4233
4234void
4235memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr) {
4236 lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
4237 klist_init(&memorystatus_klist);
4238}
4239
4240int
4241memorystatus_knote_register(struct knote *kn) {
4242 int error = 0;
4243
4244 memorystatus_klist_lock();
4245
fe8ab488 4246 if (kn->kn_sfflags & (NOTE_MEMORYSTATUS_PRESSURE_NORMAL | NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL | NOTE_MEMORYSTATUS_LOW_SWAP)) {
39236c6e 4247
fe8ab488
A
4248 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
4249 error = suser(kauth_cred_get(), 0);
4250 }
39236c6e 4251
fe8ab488 4252 if (error == 0) {
39236c6e 4253 KNOTE_ATTACH(&memorystatus_klist, kn);
b0d623f7 4254 }
39236c6e
A
4255 } else {
4256 error = ENOTSUP;
b0d623f7 4257 }
39236c6e
A
4258
4259 memorystatus_klist_unlock();
4260
4261 return error;
b0d623f7
A
4262}
4263
39236c6e
A
4264void
4265memorystatus_knote_unregister(struct knote *kn __unused) {
4266 memorystatus_klist_lock();
4267 KNOTE_DETACH(&memorystatus_klist, kn);
4268 memorystatus_klist_unlock();
4269}
316670eb 4270
fe8ab488
A
4271
4272#if 0
39236c6e
A
4273#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
4274static boolean_t
4275memorystatus_issue_pressure_kevent(boolean_t pressured) {
4276 memorystatus_klist_lock();
4277 KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
4278 memorystatus_klist_unlock();
4279 return TRUE;
4280}
39236c6e 4281#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
fe8ab488 4282#endif /* 0 */