/*
 * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 */

#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/assert.h>
#include <kern/debug.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/host.h>
#include <kern/policy_internal.h>
#include <kern/thread_group.h>

#include <corpses/task_corpse.h>
#include <libkern/libkern.h>
#include <mach/coalition.h>
#include <mach/mach_time.h>
#include <mach/task.h>
#include <mach/host_priv.h>
#include <mach/mach_host.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <sys/coalition.h>
#include <sys/kern_event.h>
#include <sys/proc.h>
#include <sys/proc_info.h>
#include <sys/reason.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/wait.h>
#include <sys/tree.h>
#include <sys/priv.h>
#include <vm/pmap.h>
#include <vm/vm_pageout.h>
#include <vm/vm_protos.h>
#include <mach/machine/sdt.h>
#include <libkern/section_keywords.h>
#include <stdatomic.h>

#include <IOKit/IOBSD.h>

#if CONFIG_FREEZE
#include <vm/vm_map.h>
#endif /* CONFIG_FREEZE */

#include <sys/kern_memorystatus.h>
#include <sys/kern_memorystatus_freeze.h>
#include <sys/kern_memorystatus_notify.h>

/* For logging clarity */
static const char *memorystatus_kill_cause_name[] = {
    "",                             /* kMemorystatusInvalid */
    "jettisoned",                   /* kMemorystatusKilled */
    "highwater",                    /* kMemorystatusKilledHiwat */
    "vnode-limit",                  /* kMemorystatusKilledVnodes */
    "vm-pageshortage",              /* kMemorystatusKilledVMPageShortage */
    "proc-thrashing",               /* kMemorystatusKilledProcThrashing */
    "fc-thrashing",                 /* kMemorystatusKilledFCThrashing */
    "per-process-limit",            /* kMemorystatusKilledPerProcessLimit */
    "disk-space-shortage",          /* kMemorystatusKilledDiskSpaceShortage */
    "idle-exit",                    /* kMemorystatusKilledIdleExit */
    "zone-map-exhaustion",          /* kMemorystatusKilledZoneMapExhaustion */
    "vm-compressor-thrashing",      /* kMemorystatusKilledVMCompressorThrashing */
    "vm-compressor-space-shortage", /* kMemorystatusKilledVMCompressorSpaceShortage */
};
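/*
 * A kill cause indexes directly into the table above, e.g.
 * memorystatus_kill_cause_name[kMemorystatusKilledHiwat] yields "highwater".
 */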

static const char *
memorystatus_priority_band_name(int32_t priority)
{
    switch (priority) {
    case JETSAM_PRIORITY_FOREGROUND:
        return "FOREGROUND";
    case JETSAM_PRIORITY_AUDIO_AND_ACCESSORY:
        return "AUDIO_AND_ACCESSORY";
    case JETSAM_PRIORITY_CONDUCTOR:
        return "CONDUCTOR";
    case JETSAM_PRIORITY_DRIVER_APPLE:
        return "DRIVER_APPLE";
    case JETSAM_PRIORITY_HOME:
        return "HOME";
    case JETSAM_PRIORITY_EXECUTIVE:
        return "EXECUTIVE";
    case JETSAM_PRIORITY_IMPORTANT:
        return "IMPORTANT";
    case JETSAM_PRIORITY_CRITICAL:
        return "CRITICAL";
    }

    return "?";
}

/* Does cause indicate vm or fc thrashing? */
static boolean_t
is_reason_thrashing(unsigned cause)
{
    switch (cause) {
    case kMemorystatusKilledFCThrashing:
    case kMemorystatusKilledVMCompressorThrashing:
    case kMemorystatusKilledVMCompressorSpaceShortage:
        return TRUE;
    default:
        return FALSE;
    }
}

/* Is the zone map almost full? */
static boolean_t
is_reason_zone_map_exhaustion(unsigned cause)
{
    if (cause == kMemorystatusKilledZoneMapExhaustion) {
        return TRUE;
    }
    return FALSE;
}

/*
 * Returns the current zone map size and capacity to include in the jetsam snapshot.
 * Defined in zalloc.c
 */
extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity);

/*
 * Returns the name of the largest zone and its size to include in the jetsam snapshot.
 * Defined in zalloc.c
 */
extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size);

/*
 * Active / Inactive limit support
 * The proc list must be locked.
 *
 * The SET_*** macros are used to initialize a limit
 * for the first time.
 *
 * The CACHE_*** macros are used to cache the limit that will
 * soon be in effect down in the ledgers.
 */

#define SET_ACTIVE_LIMITS_LOCKED(p, limit, is_fatal)                  \
MACRO_BEGIN                                                           \
(p)->p_memstat_memlimit_active = (limit);                             \
if (is_fatal) {                                                       \
    (p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;          \
} else {                                                              \
    (p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;         \
}                                                                     \
MACRO_END

#define SET_INACTIVE_LIMITS_LOCKED(p, limit, is_fatal)                \
MACRO_BEGIN                                                           \
(p)->p_memstat_memlimit_inactive = (limit);                           \
if (is_fatal) {                                                       \
    (p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;        \
} else {                                                              \
    (p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;       \
}                                                                     \
MACRO_END

#define CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal)                       \
MACRO_BEGIN                                                           \
(p)->p_memstat_memlimit = (p)->p_memstat_memlimit_active;             \
if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {         \
    (p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;                 \
    is_fatal = TRUE;                                                  \
} else {                                                              \
    (p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;                \
    is_fatal = FALSE;                                                 \
}                                                                     \
MACRO_END

#define CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal)                     \
MACRO_BEGIN                                                           \
(p)->p_memstat_memlimit = (p)->p_memstat_memlimit_inactive;           \
if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {       \
    (p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;                 \
    is_fatal = TRUE;                                                  \
} else {                                                              \
    (p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;                \
    is_fatal = FALSE;                                                 \
}                                                                     \
MACRO_END
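
/*
 * Compiled-out sketch of how the SET_* and CACHE_* macros pair up (the helper
 * name and the literal limits are hypothetical): initialize both limit
 * variants once, then cache whichever variant is currently in effect. The
 * CACHE_* macros also report the cached limit's fatality through is_fatal.
 */
#if 0
static void
memorystatus_example_refresh_limits_locked(proc_t p, boolean_t is_active)
{
    boolean_t is_fatal = FALSE;

    SET_ACTIVE_LIMITS_LOCKED(p, 100 /* MB */, TRUE);
    SET_INACTIVE_LIMITS_LOCKED(p, 50 /* MB */, FALSE);

    if (is_active) {
        CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
    } else {
        CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
    }
    /* is_fatal now reflects whether the cached limit is fatal. */
}
#endif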


/* General tunables */

unsigned long delta_percentage = 5;
unsigned long critical_threshold_percentage = 5;
// On embedded devices with more than 3GB of memory we lower the critical percentage.
uint64_t config_jetsam_large_memory_cutoff = 3UL * (1UL << 30);
unsigned long critical_threshold_percentage_larger_devices = 4;
unsigned long delta_percentage_larger_devices = 4;
unsigned long idle_offset_percentage = 5;
unsigned long pressure_threshold_percentage = 15;
unsigned long policy_more_free_offset_percentage = 5;
unsigned long sysproc_aging_aggr_threshold_percentage = 7;

/*
 * default jetsam snapshot support
 */
memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_copy;

#if CONFIG_FREEZE
memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_freezer;
/*
 * The size of the freezer snapshot is given by memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR.
 * The freezer snapshot can be much smaller than the default snapshot
 * because it only includes apps that have been killed, and dasd consumes it every 30 minutes.
 * Since the snapshots are always wired, we don't want to overallocate too much.
 */
#define JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR 20
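/*
 * Worked sizing example (hypothetical maxproc of 1000): memorystatus_init()
 * sizes the default snapshot for 1000 entries and the freezer snapshot for
 * 1000 / 20 = 50 entries.
 */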
unsigned int memorystatus_jetsam_snapshot_freezer_max;
unsigned int memorystatus_jetsam_snapshot_freezer_size;
TUNABLE(bool, memorystatus_jetsam_use_freezer_snapshot, "kern.jetsam_user_freezer_snapshot", true);
#endif /* CONFIG_FREEZE */

unsigned int memorystatus_jetsam_snapshot_count = 0;
unsigned int memorystatus_jetsam_snapshot_copy_count = 0;
unsigned int memorystatus_jetsam_snapshot_max = 0;
unsigned int memorystatus_jetsam_snapshot_size = 0;
uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
uint64_t memorystatus_jetsam_snapshot_timeout = 0;

#if DEVELOPMENT || DEBUG
/*
 * On development and debug kernels, we allow one pid to take ownership
 * of the memorystatus snapshot (via memorystatus_control).
 * If there's an owner, then only they may consume the snapshot.
 * This is used when testing the snapshot interface to avoid racing with other
 * processes on the system that consume snapshots.
 */
static pid_t memorystatus_snapshot_owner = 0;
SYSCTL_INT(_kern, OID_AUTO, memorystatus_snapshot_owner, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_snapshot_owner, 0, "");
#endif /* DEVELOPMENT || DEBUG */
static void memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot);

/* General memorystatus stuff */

uint64_t memorystatus_sysprocs_idle_delay_time = 0;
uint64_t memorystatus_apps_idle_delay_time = 0;
/* Some devices give entitled apps a higher memory limit */
#if __arm64__
int32_t memorystatus_entitled_max_task_footprint_mb = 0;

#if DEVELOPMENT || DEBUG
SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_entitled_max_task_footprint_mb, 0, "");
#endif /* DEVELOPMENT || DEBUG */
#endif /* __arm64__ */

static lck_grp_attr_t *memorystatus_jetsam_fg_band_lock_grp_attr;
static lck_grp_t *memorystatus_jetsam_fg_band_lock_grp;
lck_mtx_t memorystatus_jetsam_fg_band_lock;

/* Idle guard handling */

static int32_t memorystatus_scheduled_idle_demotions_sysprocs = 0;
static int32_t memorystatus_scheduled_idle_demotions_apps = 0;

static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state);
static void memorystatus_reschedule_idle_demotion_locked(void);
int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap);
vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
void memorystatus_send_low_swap_note(void);
int memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index);
boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count,
    uint32_t *errors, uint64_t *memory_reclaimed);
uint64_t memorystatus_available_memory_internal(proc_t p);

unsigned int memorystatus_level = 0;
static int memorystatus_list_count = 0;
memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
static thread_call_t memorystatus_idle_demotion_call;
uint64_t memstat_idle_demotion_deadline = 0;
int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
int applications_aging_band = JETSAM_PRIORITY_IDLE;

#define isProcessInAgingBands(p) ((isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) || (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)))

#define kJetsamAgingPolicyNone                   (0)
#define kJetsamAgingPolicyLegacy                 (1)
#define kJetsamAgingPolicySysProcsReclaimedFirst (2)
#define kJetsamAgingPolicyAppsReclaimedFirst     (3)
#define kJetsamAgingPolicyMax                    kJetsamAgingPolicyAppsReclaimedFirst

unsigned int jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst;

extern int corpse_for_fatal_memkill;
extern uint64_t vm_purgeable_purge_task_owned(task_t task);
boolean_t memorystatus_allowed_vm_map_fork(task_t);
#if DEVELOPMENT || DEBUG
void memorystatus_abort_vm_map_fork(task_t);
#endif

/*
 * Idle delay timeout factors for daemons, based on relaunch behavior. Only used by
 * the kJetsamAgingPolicySysProcsReclaimedFirst aging policy.
 */
#define kJetsamSysProcsIdleDelayTimeLowRatio    (5)
#define kJetsamSysProcsIdleDelayTimeMedRatio    (2)
#define kJetsamSysProcsIdleDelayTimeHighRatio   (1)
static_assert(kJetsamSysProcsIdleDelayTimeLowRatio <= DEFERRED_IDLE_EXIT_TIME_SECS, "sysproc idle delay time for low relaunch daemons would be 0");

/*
 * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, treat apps as
 * well-behaved daemons for aging purposes.
 */
#define kJetsamAppsIdleDelayTimeRatio   (kJetsamSysProcsIdleDelayTimeLowRatio)

static uint64_t
memorystatus_sysprocs_idle_time(proc_t p)
{
    /*
     * The kJetsamAgingPolicySysProcsReclaimedFirst aging policy uses the relaunch behavior to
     * determine the exact idle deferred time provided to the daemons. For all other aging
     * policies, simply return the default aging idle time.
     */
    if (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst) {
        return memorystatus_sysprocs_idle_delay_time;
    }

    uint64_t idle_delay_time = 0;
    /*
     * For system processes, base the idle delay time on the
     * jetsam relaunch behavior specified by launchd. The idea
     * is to provide extra protection to the daemons which would
     * relaunch immediately after jetsam.
     */
    switch (p->p_memstat_relaunch_flags) {
    case P_MEMSTAT_RELAUNCH_UNKNOWN:
    case P_MEMSTAT_RELAUNCH_LOW:
        idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeLowRatio;
        break;
    case P_MEMSTAT_RELAUNCH_MED:
        idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeMedRatio;
        break;
    case P_MEMSTAT_RELAUNCH_HIGH:
        idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeHighRatio;
        break;
    default:
        panic("Unknown relaunch flags on process!");
        break;
    }
    return idle_delay_time;
}
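
/*
 * Worked example (assuming the default DEFERRED_IDLE_EXIT_TIME_SECS of 10
 * seconds that memorystatus_init() feeds into these globals): low/unknown
 * relaunch daemons age out of the aging band after 10 / 5 = 2 seconds,
 * medium after 10 / 2 = 5 seconds, and high after 10 / 1 = 10 seconds.
 */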

static uint64_t
memorystatus_apps_idle_time(__unused proc_t p)
{
    /*
     * Under kJetsamAgingPolicySysProcsReclaimedFirst, apps are considered low
     * relaunch candidates, so they are only given limited protection. Under
     * all other aging policies, return the default aging idle time.
     */
    if (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst) {
        return memorystatus_apps_idle_delay_time;
    }

    return memorystatus_apps_idle_delay_time / kJetsamAppsIdleDelayTimeRatio;
}
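
/*
 * Under the same 10-second assumption as above, apps receive 10 / 5 = 2
 * seconds of aging-band protection when this policy is active.
 */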


#if 0

/* Keeping around for future use if we need a utility that can do this OR an app that needs a dynamic adjustment. */

static int
sysctl_set_jetsam_aging_policy SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

    int error = 0, val = 0;
    memstat_bucket_t *old_bucket = 0;
    int old_system_procs_aging_band = 0, new_system_procs_aging_band = 0;
    int old_applications_aging_band = 0, new_applications_aging_band = 0;
    proc_t p = NULL, next_proc = NULL;


    error = sysctl_io_number(req, jetsam_aging_policy, sizeof(int), &val, NULL);
    if (error || !req->newptr) {
        return error;
    }

    if ((val < 0) || (val > kJetsamAgingPolicyMax)) {
        printf("jetsam: ordering policy sysctl has invalid value - %d\n", val);
        return EINVAL;
    }

    /*
     * We need to synchronize with any potential adding/removal from aging bands
     * that might be in progress currently. We use the proc_list_lock() just for
     * consistency with all the routines dealing with 'aging' processes; all we
     * really need is a lighter-weight lock.
     */
    proc_list_lock();

    old_system_procs_aging_band = system_procs_aging_band;
    old_applications_aging_band = applications_aging_band;

    switch (val) {
    case kJetsamAgingPolicyNone:
        new_system_procs_aging_band = JETSAM_PRIORITY_IDLE;
        new_applications_aging_band = JETSAM_PRIORITY_IDLE;
        break;

    case kJetsamAgingPolicyLegacy:
        /*
         * Legacy behavior: some daemons get a one-time 10s protection, applied
         * only before their first clean->dirty->clean transition, before going
         * into the IDLE band.
         */
        new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
        new_applications_aging_band = JETSAM_PRIORITY_IDLE;
        break;

    case kJetsamAgingPolicySysProcsReclaimedFirst:
        new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
        new_applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
        break;

    case kJetsamAgingPolicyAppsReclaimedFirst:
        new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2;
        new_applications_aging_band = JETSAM_PRIORITY_AGING_BAND1;
        break;

    default:
        break;
    }

    if (old_system_procs_aging_band && (old_system_procs_aging_band != new_system_procs_aging_band)) {
        old_bucket = &memstat_bucket[old_system_procs_aging_band];
        p = TAILQ_FIRST(&old_bucket->list);

        while (p) {
            next_proc = TAILQ_NEXT(p, p_memstat_list);

            if (isSysProc(p)) {
                if (new_system_procs_aging_band == JETSAM_PRIORITY_IDLE) {
                    memorystatus_invalidate_idle_demotion_locked(p, TRUE);
                }

                memorystatus_update_priority_locked(p, new_system_procs_aging_band, false, true);
            }

            p = next_proc;
            continue;
        }
    }

    if (old_applications_aging_band && (old_applications_aging_band != new_applications_aging_band)) {
        old_bucket = &memstat_bucket[old_applications_aging_band];
        p = TAILQ_FIRST(&old_bucket->list);

        while (p) {
            next_proc = TAILQ_NEXT(p, p_memstat_list);

            if (isApp(p)) {
                if (new_applications_aging_band == JETSAM_PRIORITY_IDLE) {
                    memorystatus_invalidate_idle_demotion_locked(p, TRUE);
                }

                memorystatus_update_priority_locked(p, new_applications_aging_band, false, true);
            }

            p = next_proc;
            continue;
        }
    }

    jetsam_aging_policy = val;
    system_procs_aging_band = new_system_procs_aging_band;
    applications_aging_band = new_applications_aging_band;

    proc_list_unlock();

    return 0;
}

SYSCTL_PROC(_kern, OID_AUTO, set_jetsam_aging_policy, CTLTYPE_INT | CTLFLAG_RW,
    0, 0, sysctl_set_jetsam_aging_policy, "I", "Jetsam Aging Policy");
#endif /*0*/

static int
sysctl_jetsam_set_sysprocs_idle_delay_time SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

    int error = 0, val = 0, old_time_in_secs = 0;
    uint64_t old_time_in_ns = 0;

    absolutetime_to_nanoseconds(memorystatus_sysprocs_idle_delay_time, &old_time_in_ns);
    old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);

    error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
    if (error || !req->newptr) {
        return error;
    }

    if ((val < 0) || (val > INT32_MAX)) {
        printf("jetsam: new idle delay interval has invalid value.\n");
        return EINVAL;
    }

    nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);

    return 0;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_sysprocs_idle_delay_time, CTLTYPE_INT | CTLFLAG_RW,
    0, 0, sysctl_jetsam_set_sysprocs_idle_delay_time, "I", "Aging window for system processes");
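
/*
 * Usage sketch (hypothetical value): the aging window above can be widened to
 * 20 seconds with `sysctl kern.memorystatus_sysprocs_idle_delay_time=20`; the
 * handler converts the new second count back to absolute time. The matching
 * kern.memorystatus_apps_idle_delay_time control below works the same way
 * for applications.
 */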


static int
sysctl_jetsam_set_apps_idle_delay_time SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

    int error = 0, val = 0, old_time_in_secs = 0;
    uint64_t old_time_in_ns = 0;

    absolutetime_to_nanoseconds(memorystatus_apps_idle_delay_time, &old_time_in_ns);
    old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);

    error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
    if (error || !req->newptr) {
        return error;
    }

    if ((val < 0) || (val > INT32_MAX)) {
        printf("jetsam: new idle delay interval has invalid value.\n");
        return EINVAL;
    }

    nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);

    return 0;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_apps_idle_delay_time, CTLTYPE_INT | CTLFLAG_RW,
    0, 0, sysctl_jetsam_set_apps_idle_delay_time, "I", "Aging window for applications");

SYSCTL_INT(_kern, OID_AUTO, jetsam_aging_policy, CTLTYPE_INT | CTLFLAG_RD, &jetsam_aging_policy, 0, "");

static unsigned int memorystatus_dirty_count = 0;

SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, &max_task_footprint_mb, 0, "");

static int memorystatus_highwater_enabled = 1;  /* Update the cached memlimit data. */
static boolean_t proc_jetsam_state_is_active_locked(proc_t);

#if __arm64__
int legacy_footprint_bonus_mb = 50; /* This value was chosen after looking at the top 30 apps
                                     * that needed the additional room in their footprint when
                                     * the 'correct' accounting methods were applied to them.
                                     */

#if DEVELOPMENT || DEBUG
SYSCTL_INT(_kern, OID_AUTO, legacy_footprint_bonus_mb, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_footprint_bonus_mb, 0, "");
#endif /* DEVELOPMENT || DEBUG */
/*
 * Raise the inactive and active memory limits to new values.
 * This will only ever raise the limits; it does nothing if either of the
 * current limits is 0.
 * The caller must hold the proc_list_lock.
 */
static void
memorystatus_raise_memlimit(proc_t p, int new_memlimit_active, int new_memlimit_inactive)
{
    int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
    boolean_t memlimit_active_is_fatal = FALSE, memlimit_inactive_is_fatal = FALSE, use_active_limit = FALSE;

    LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);

    if (p->p_memstat_memlimit_active > 0) {
        memlimit_mb_active = p->p_memstat_memlimit_active;
    } else if (p->p_memstat_memlimit_active == -1) {
        memlimit_mb_active = max_task_footprint_mb;
    } else {
        /*
         * Nothing to do for '0' which is
         * a special value only used internally
         * to test 'no limits'.
         */
        return;
    }

    if (p->p_memstat_memlimit_inactive > 0) {
        memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
    } else if (p->p_memstat_memlimit_inactive == -1) {
        memlimit_mb_inactive = max_task_footprint_mb;
    } else {
        /*
         * Nothing to do for '0' which is
         * a special value only used internally
         * to test 'no limits'.
         */
        return;
    }

    memlimit_mb_active = MAX(new_memlimit_active, memlimit_mb_active);
    memlimit_mb_inactive = MAX(new_memlimit_inactive, memlimit_mb_inactive);

    memlimit_active_is_fatal = (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL);
    memlimit_inactive_is_fatal = (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL);

    SET_ACTIVE_LIMITS_LOCKED(p, memlimit_mb_active, memlimit_active_is_fatal);
    SET_INACTIVE_LIMITS_LOCKED(p, memlimit_mb_inactive, memlimit_inactive_is_fatal);

    if (proc_jetsam_state_is_active_locked(p) == TRUE) {
        use_active_limit = TRUE;
        CACHE_ACTIVE_LIMITS_LOCKED(p, memlimit_active_is_fatal);
    } else {
        CACHE_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive_is_fatal);
    }

    if (memorystatus_highwater_enabled) {
        task_set_phys_footprint_limit_internal(p->task,
            (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
            NULL, /*return old value */
            use_active_limit, /*active limit?*/
            (use_active_limit ? memlimit_active_is_fatal : memlimit_inactive_is_fatal));
    }
}

void
memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase)
{
    int memlimit_mb_active = 0, memlimit_mb_inactive = 0;

    if (p == NULL) {
        return;
    }

    proc_list_lock();

    if (p->p_memstat_memlimit_active > 0) {
        memlimit_mb_active = p->p_memstat_memlimit_active;
    } else if (p->p_memstat_memlimit_active == -1) {
        memlimit_mb_active = max_task_footprint_mb;
    } else {
        /*
         * Nothing to do for '0' which is
         * a special value only used internally
         * to test 'no limits'.
         */
        proc_list_unlock();
        return;
    }

    if (p->p_memstat_memlimit_inactive > 0) {
        memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
    } else if (p->p_memstat_memlimit_inactive == -1) {
        memlimit_mb_inactive = max_task_footprint_mb;
    } else {
        /*
         * Nothing to do for '0' which is
         * a special value only used internally
         * to test 'no limits'.
         */
        proc_list_unlock();
        return;
    }

    if (footprint_increase) {
        memlimit_mb_active += legacy_footprint_bonus_mb;
        memlimit_mb_inactive += legacy_footprint_bonus_mb;
    } else {
        memlimit_mb_active -= legacy_footprint_bonus_mb;
        if (memlimit_mb_active == max_task_footprint_mb) {
            memlimit_mb_active = -1; /* reverting back to default system limit */
        }

        memlimit_mb_inactive -= legacy_footprint_bonus_mb;
        if (memlimit_mb_inactive == max_task_footprint_mb) {
            memlimit_mb_inactive = -1; /* reverting back to default system limit */
        }
    }
    memorystatus_raise_memlimit(p, memlimit_mb_active, memlimit_mb_inactive);

    proc_list_unlock();
}
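
/*
 * Worked example (illustrative values): with the default 50 MB bonus, a
 * process whose active limit is 100 MB is raised to 150 MB on entitlement.
 * On revocation the bonus is subtracted again, and a result that lands
 * exactly on max_task_footprint_mb is collapsed to -1, i.e. back to the
 * default system limit.
 */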

void
memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)
{
    if (max_mem < 1500ULL * 1024 * 1024 ||
        max_mem > 2ULL * 1024 * 1024 * 1024) {
        /* ios13extended_footprint is only for 2GB devices */
        return;
    }
    /* limit to "almost 2GB" */
    proc_list_lock();
    memorystatus_raise_memlimit(p, 1800, 1800);
    proc_list_unlock();
}
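
/*
 * Worked example: a 2 GB device (1500 MB < max_mem <= 2048 MB) qualifies, and
 * both the active and inactive limits are raised to 1800 MB; devices outside
 * that range return early with their limits untouched.
 */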

void
memorystatus_act_on_entitled_task_limit(proc_t p)
{
    if (memorystatus_entitled_max_task_footprint_mb == 0) {
        // Entitlement is not supported on this device.
        return;
    }
    proc_list_lock();
    memorystatus_raise_memlimit(p, memorystatus_entitled_max_task_footprint_mb, memorystatus_entitled_max_task_footprint_mb);
    proc_list_unlock();
}
#endif /* __arm64__ */

SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_level, 0, "");

int
memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
{
    user_addr_t level = 0;

    level = args->level;

    if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
        return EFAULT;
    }

    return 0;
}

static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);

/* Memory Limits */

static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);


static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry);

static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static void memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry);
static int memorystatus_set_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry);

int proc_get_memstat_priority(proc_t, boolean_t);

static boolean_t memorystatus_idle_snapshot = 0;

unsigned int memorystatus_delta = 0;

/* Jetsam Loop Detection */
static boolean_t memorystatus_jld_enabled = FALSE;              /* Enable jetsam loop detection */
static uint32_t memorystatus_jld_eval_period_msecs = 0;         /* Init pass sets this based on device memory size */
static int memorystatus_jld_eval_aggressive_count = 3;          /* Raise the priority max after 'n' aggressive loops */
static int memorystatus_jld_eval_aggressive_priority_band_max = 15;  /* Kill aggressively up through this band */

/*
 * A FG app can request that the aggressive jetsam mechanism display some leniency in the FG band. This 'lenient' mode is described as:
 * --- if aggressive jetsam kills an app in the FG band and gets back >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD memory, it will stop the aggressive march further into and up the jetsam bands.
 *
 * RESTRICTIONS:
 * - Such a request is respected/acknowledged only once while the 'requesting' app is in the FG band, i.e. if aggressive jetsam was
 *   needed and the 'lenient' mode was deployed, then that's it for this special mode while the app is in the FG band.
 *
 * - If the app is still in the FG band and aggressive jetsam is needed again, there will be no stop-and-check the next time around.
 *
 * - Also, the transition of the 'requesting' app away from the FG band will void this special behavior.
 */

#define AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD 25
boolean_t memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
boolean_t memorystatus_aggressive_jetsam_lenient = FALSE;

#if DEVELOPMENT || DEBUG
/*
 * Jetsam Loop Detection tunables.
 */

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_eval_period_msecs, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_count, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_count, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_priority_band_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_priority_band_max, 0, "");
#endif /* DEVELOPMENT || DEBUG */

static uint32_t kill_under_pressure_cause = 0;

/*
 * snapshot support for memstats collected at boot.
 */
static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot;

static void memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count);
static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount);
static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime);

static void memorystatus_clear_errors(void);
static void memorystatus_get_task_phys_footprint_page_counts(task_t task,
    uint64_t *internal_pages, uint64_t *internal_compressed_pages,
    uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
    uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
    uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages);

static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *count);

static uint32_t memorystatus_build_state(proc_t p);
//static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);

static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority,
    uint32_t *errors, uint64_t *memory_reclaimed);
static boolean_t memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors, uint64_t *memory_reclaimed);
static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed);

static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);

/* Priority Band Sorting Routines */
static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order);
static int memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order);
static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index);
static int memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz);

/* qsort routines */
typedef int (*cmpfunc_t)(const void *a, const void *b);
extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
static int memstat_asc_cmp(const void *a, const void *b);

/* VM pressure */

extern unsigned int vm_page_free_count;
extern unsigned int vm_page_active_count;
extern unsigned int vm_page_inactive_count;
extern unsigned int vm_page_throttled_count;
extern unsigned int vm_page_purgeable_count;
extern unsigned int vm_page_wire_count;
extern unsigned int vm_page_speculative_count;

#if CONFIG_JETSAM
#define MEMORYSTATUS_LOG_AVAILABLE_PAGES memorystatus_available_pages
#else /* CONFIG_JETSAM */
#define MEMORYSTATUS_LOG_AVAILABLE_PAGES (vm_page_active_count + vm_page_inactive_count + vm_page_free_count + vm_page_speculative_count)
#endif /* CONFIG_JETSAM */
#if CONFIG_SECLUDED_MEMORY
extern unsigned int vm_page_secluded_count;
extern unsigned int vm_page_secluded_count_over_target;
#endif /* CONFIG_SECLUDED_MEMORY */

/* Aggressive jetsam pages threshold for sysproc aging policy */
unsigned int memorystatus_sysproc_aging_aggr_pages = 0;

#if CONFIG_JETSAM
unsigned int memorystatus_available_pages = (unsigned int)-1;
unsigned int memorystatus_available_pages_pressure = 0;
unsigned int memorystatus_available_pages_critical = 0;
unsigned int memorystatus_available_pages_critical_base = 0;
unsigned int memorystatus_available_pages_critical_idle_offset = 0;

#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
#else
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
#endif /* DEVELOPMENT || DEBUG */

static unsigned int memorystatus_jetsam_policy = kPolicyDefault;
unsigned int memorystatus_policy_more_free_offset_pages = 0;
static void memorystatus_update_levels_locked(boolean_t critical_only);
static unsigned int memorystatus_thread_wasted_wakeup = 0;

/* Callback into vm_compressor.c to signal that thrashing has been mitigated. */
extern void vm_thrashing_jetsam_done(void);
static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);
#if DEVELOPMENT || DEBUG
static inline uint32_t
roundToNearestMB(uint32_t in)
{
    return (in + ((1 << 20) - 1)) >> 20;
}
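
/*
 * Note: despite its name, the helper above rounds a byte count *up* to whole
 * megabytes, e.g. 1 byte -> 1 MB and (1 MB + 1 byte) -> 2 MB; 0 stays 0.
 */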

static int memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase);
#endif

int32_t max_kill_priority = JETSAM_PRIORITY_MAX;

#else /* CONFIG_JETSAM */

uint64_t memorystatus_available_pages = (uint64_t)-1;
uint64_t memorystatus_available_pages_pressure = (uint64_t)-1;
uint64_t memorystatus_available_pages_critical = (uint64_t)-1;

int32_t max_kill_priority = JETSAM_PRIORITY_IDLE;
#endif /* CONFIG_JETSAM */

#if DEVELOPMENT || DEBUG

lck_grp_attr_t *disconnect_page_mappings_lck_grp_attr;
lck_grp_t *disconnect_page_mappings_lck_grp;
static lck_mtx_t disconnect_page_mappings_mutex;

extern bool kill_on_no_paging_space;
#endif /* DEVELOPMENT || DEBUG */


/* Debug */

extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);

#if DEVELOPMENT || DEBUG

static unsigned int memorystatus_debug_dump_this_bucket = 0;

static void
memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)
{
    proc_t p = NULL;
    uint64_t bytes = 0;
    int ledger_limit = 0;
    unsigned int b = bucket_index;
    boolean_t traverse_all_buckets = FALSE;

    if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
        traverse_all_buckets = TRUE;
        b = 0;
    } else {
        traverse_all_buckets = FALSE;
        b = bucket_index;
    }

    /*
     * footprint reported in [pages / MB]
     * limits reported as:
     *      L-limit  proc's Ledger limit
     *      C-limit  proc's Cached limit, should match Ledger
     *      A-limit  proc's Active limit
     *     IA-limit  proc's Inactive limit
     *      F==Fatal, NF==NonFatal
     */

    printf("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64);
    printf("bucket [pid] [pages / MB] [state] [EP / RP / AP] dirty deadline [L-limit / C-limit / A-limit / IA-limit] name\n");
    p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
    while (p) {
        bytes = get_task_phys_footprint(p->task);
        task_get_phys_footprint_limit(p->task, &ledger_limit);
        printf("%2d [%5d] [%5lld /%3lldMB] 0x%-8x [%2d / %2d / %2d] 0x%-3x %10lld [%3d / %3d%s / %3d%s / %3d%s] %s\n",
            b, p->p_pid,
            (bytes / PAGE_SIZE_64),        /* task's footprint converted from bytes to pages */
            (bytes / (1024ULL * 1024ULL)), /* task's footprint converted from bytes to MB */
            p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_assertionpriority,
            p->p_memstat_dirty, p->p_memstat_idledeadline,
            ledger_limit,
            p->p_memstat_memlimit,
            (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
            p->p_memstat_memlimit_active,
            (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"),
            p->p_memstat_memlimit_inactive,
            (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"),
            (*p->p_name ? p->p_name : "unknown"));
        p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets);
    }
    printf("memorystatus_debug_dump ***END***\n");
}

static int
sysctl_memorystatus_debug_dump_bucket SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
    int bucket_index = 0;
    int error;
    error = SYSCTL_OUT(req, arg1, sizeof(int));
    if (error || !req->newptr) {
        return error;
    }
    error = SYSCTL_IN(req, &bucket_index, sizeof(int));
    if (error || !req->newptr) {
        return error;
    }
    if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
        /*
         * All jetsam buckets will be dumped.
         */
    } else {
        /*
         * Only a single bucket will be dumped.
         */
    }

    proc_list_lock();
    memorystatus_debug_dump_bucket_locked(bucket_index);
    proc_list_unlock();
    memorystatus_debug_dump_this_bucket = bucket_index;
    return error;
}

/*
 * Debug aid to look at jetsam buckets and proc jetsam fields.
 * Use this sysctl to act on a particular jetsam bucket.
 * Writing the sysctl triggers the dump.
 * Usage: sysctl kern.memorystatus_debug_dump_this_bucket=<bucket_index>
 */

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_debug_dump_this_bucket, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_debug_dump_this_bucket, 0, sysctl_memorystatus_debug_dump_bucket, "I", "");


/* Debug aid to help determine the effective memory limit. */

static int
sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
    proc_t p;
    unsigned int b = 0;
    int error, enable = 0;
    boolean_t use_active; /* use the active limit and active limit attributes */
    boolean_t is_fatal;

    error = SYSCTL_OUT(req, arg1, sizeof(int));
    if (error || !req->newptr) {
        return error;
    }

    error = SYSCTL_IN(req, &enable, sizeof(int));
    if (error || !req->newptr) {
        return error;
    }

    if (!(enable == 0 || enable == 1)) {
        return EINVAL;
    }

    proc_list_lock();

    p = memorystatus_get_first_proc_locked(&b, TRUE);
    while (p) {
        use_active = proc_jetsam_state_is_active_locked(p);

        if (enable) {
            if (use_active == TRUE) {
                CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
            } else {
                CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
            }
        } else {
            /*
             * Disabling limits does not touch the stored variants.
             * Set the cached limit fields to system-wide defaults.
             */
            p->p_memstat_memlimit = -1;
            p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
            is_fatal = TRUE;
        }

        /*
         * Enforce the cached limit by writing to the ledger.
         */
        task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit: -1, NULL, use_active, is_fatal);

        p = memorystatus_get_next_proc_locked(&b, p, TRUE);
    }

    memorystatus_highwater_enabled = enable;

    proc_list_unlock();

    return 0;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");

SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");

#if CONFIG_JETSAM
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_policy_more_free_offset_pages, CTLFLAG_RW, &memorystatus_policy_more_free_offset_pages, 0, "");

static unsigned int memorystatus_jetsam_panic_debug = 0;

#if VM_PRESSURE_EVENTS

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");

#endif /* VM_PRESSURE_EVENTS */

#endif /* CONFIG_JETSAM */

#endif /* DEVELOPMENT || DEBUG */

extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
    void *parameter,
    integer_t priority,
    thread_t *new_thread);

#if DEVELOPMENT || DEBUG

static int
sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
    int error = 0, pid = 0;
    proc_t p;

    error = sysctl_handle_int(oidp, &pid, 0, req);
    if (error || !req->newptr) {
        return error;
    }

    lck_mtx_lock(&disconnect_page_mappings_mutex);

    if (pid == -1) {
        vm_pageout_disconnect_all_pages();
    } else {
        p = proc_find(pid);

        if (p != NULL) {
            error = task_disconnect_page_mappings(p->task);

            proc_rele(p);

            if (error) {
                error = EIO;
            }
        } else {
            error = EINVAL;
        }
    }
    lck_mtx_unlock(&disconnect_page_mappings_mutex);

    return error;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", "");

#endif /* DEVELOPMENT || DEBUG */

/*
 * Sorts the given bucket.
 *
 * Input:
 *      bucket_index - jetsam priority band to be sorted.
 *      sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
 *              Currently sort_order is only meaningful when handling
 *              coalitions.
 *
 * proc_list_lock must be held by the caller.
 */
static void
memorystatus_sort_bucket_locked(unsigned int bucket_index, int sort_order)
{
    LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
    if (memstat_bucket[bucket_index].count == 0) {
        return;
    }

    switch (bucket_index) {
    case JETSAM_PRIORITY_FOREGROUND:
        if (memorystatus_sort_by_largest_coalition_locked(bucket_index, sort_order) == 0) {
            /*
             * Fall back to per process sorting when zero coalitions are found.
             */
            memorystatus_sort_by_largest_process_locked(bucket_index);
        }
        break;
    default:
        memorystatus_sort_by_largest_process_locked(bucket_index);
        break;
    }
}

/*
 * Picks the sorting routine for a given jetsam priority band.
 *
 * Input:
 *      bucket_index - jetsam priority band to be sorted.
 *      sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
 *              Currently sort_order is only meaningful when handling
 *              coalitions.
 *
 * Return:
 *      0     on success
 *      non-0 on failure
 */
static int
memorystatus_sort_bucket(unsigned int bucket_index, int sort_order)
{
    int coal_sort_order;

    /*
     * Verify the jetsam priority
     */
    if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
        return EINVAL;
    }

#if DEVELOPMENT || DEBUG
    if (sort_order == JETSAM_SORT_DEFAULT) {
        coal_sort_order = COALITION_SORT_DEFAULT;
    } else {
        coal_sort_order = sort_order; /* only used for testing scenarios */
    }
#else
    /* Verify default */
    if (sort_order == JETSAM_SORT_DEFAULT) {
        coal_sort_order = COALITION_SORT_DEFAULT;
    } else {
        return EINVAL;
    }
#endif

    proc_list_lock();
    memorystatus_sort_bucket_locked(bucket_index, coal_sort_order);
    proc_list_unlock();

    return 0;
}

/*
 * Sort processes by size for a single jetsam bucket.
 */
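/*
 * This is effectively an in-place selection sort: each outer pass scans the
 * rest of the bucket for the largest remaining process and splices it in
 * after the previously placed entry, so the cost is O(n^2) in the bucket
 * length, which is presumably acceptable because individual bands stay short.
 */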

static void
memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
{
    proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
    proc_t next_p = NULL, prev_max_proc = NULL;
    uint32_t pages = 0, max_pages = 0;
    memstat_bucket_t *current_bucket;

    if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
        return;
    }

    current_bucket = &memstat_bucket[bucket_index];

    p = TAILQ_FIRST(&current_bucket->list);

    while (p) {
        memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
        max_pages = pages;
        max_proc = p;
        prev_max_proc = p;

        while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
            /* traversing list until we find next largest process */
            p = next_p;
            memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
            if (pages > max_pages) {
                max_pages = pages;
                max_proc = p;
            }
        }

        if (prev_max_proc != max_proc) {
            /* found a larger process, place it in the list */
            TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
            if (insert_after_proc == NULL) {
                TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
            } else {
                TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
            }
            prev_max_proc = max_proc;
        }

        insert_after_proc = max_proc;

        p = TAILQ_NEXT(max_proc, p_memstat_list);
    }
}

proc_t
memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search)
{
    memstat_bucket_t *current_bucket;
    proc_t next_p;

    if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
        return NULL;
    }

    current_bucket = &memstat_bucket[*bucket_index];
    next_p = TAILQ_FIRST(&current_bucket->list);
    if (!next_p && search) {
        while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
            current_bucket = &memstat_bucket[*bucket_index];
            next_p = TAILQ_FIRST(&current_bucket->list);
        }
    }

    return next_p;
}

proc_t
memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search)
{
    memstat_bucket_t *current_bucket;
    proc_t next_p;

    if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
        return NULL;
    }

    next_p = TAILQ_NEXT(p, p_memstat_list);
    while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
        current_bucket = &memstat_bucket[*bucket_index];
        next_p = TAILQ_FIRST(&current_bucket->list);
    }

    return next_p;
}
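
/*
 * Compiled-out sketch of the traversal pattern these two accessors support;
 * it mirrors the loops used by the sysctl handlers in this file (the helper
 * name is hypothetical). The proc list lock must be held throughout.
 */
#if 0
static void
memorystatus_example_walk_all_buckets_locked(void)
{
    unsigned int b = 0;
    proc_t p = memorystatus_get_first_proc_locked(&b, TRUE);

    while (p) {
        /* ... inspect p here, under the proc list lock ... */
        p = memorystatus_get_next_proc_locked(&b, p, TRUE);
    }
}
#endif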

/*
 * Structure to hold state for a jetsam thread.
 * Typically there should be a single jetsam thread
 * unless parallel jetsam is enabled.
 */
struct jetsam_thread_state {
    uint8_t  inited;              /* boolean - if the thread is initialized */
    uint8_t  limit_to_low_bands;  /* boolean */
    int      memorystatus_wakeup; /* wake channel */
    int      index;               /* jetsam thread index */
    thread_t thread;              /* jetsam thread pointer */
} *jetsam_threads;

/* Maximum number of jetsam threads allowed */
#define JETSAM_THREADS_LIMIT 3

/* Number of active jetsam threads */
_Atomic int active_jetsam_threads = 1;

/* Number of maximum jetsam threads configured */
int max_jetsam_threads = JETSAM_THREADS_LIMIT;

/*
 * Global switch for enabling fast jetsam. Fast jetsam is
 * hooked up via the system_override() system call. It has the
 * following effects:
 * - Raise the jetsam threshold ("clear-the-deck")
 * - Enable parallel jetsam on eligible devices
 */
#if __AMP__
int fast_jetsam_enabled = 1;
#else /* __AMP__ */
int fast_jetsam_enabled = 0;
#endif /* __AMP__ */

#if CONFIG_DIRTYSTATUS_TRACKING
int dirtystatus_tracking_enabled = 0;
SYSCTL_INT(_kern, OID_AUTO, dirtystatus_tracking_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &dirtystatus_tracking_enabled, 0, "");
#endif

/* Routine to find the jetsam state structure for the current jetsam thread */
static inline struct jetsam_thread_state *
jetsam_current_thread(void)
{
    for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) {
        if (jetsam_threads[thr_id].thread == current_thread()) {
            return &(jetsam_threads[thr_id]);
        }
    }
    return NULL;
}

__private_extern__ void
memorystatus_init(void)
{
    kern_return_t result;
    int i;

#if CONFIG_FREEZE
    memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_UI_SUPPORT;
    memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX;
    memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */
    memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4);
    memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
    memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
    memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS;
    memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD;
#endif

#if DEVELOPMENT || DEBUG
    disconnect_page_mappings_lck_grp_attr = lck_grp_attr_alloc_init();
    disconnect_page_mappings_lck_grp = lck_grp_alloc_init("disconnect_page_mappings", disconnect_page_mappings_lck_grp_attr);

    lck_mtx_init(&disconnect_page_mappings_mutex, disconnect_page_mappings_lck_grp, NULL);

    if (kill_on_no_paging_space) {
        max_kill_priority = JETSAM_PRIORITY_MAX;
    }
#endif

    memorystatus_jetsam_fg_band_lock_grp_attr = lck_grp_attr_alloc_init();
    memorystatus_jetsam_fg_band_lock_grp =
        lck_grp_alloc_init("memorystatus_jetsam_fg_band", memorystatus_jetsam_fg_band_lock_grp_attr);
    lck_mtx_init(&memorystatus_jetsam_fg_band_lock, memorystatus_jetsam_fg_band_lock_grp, NULL);

    /* Init buckets */
    for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
        TAILQ_INIT(&memstat_bucket[i].list);
        memstat_bucket[i].count = 0;
        memstat_bucket[i].relaunch_high_count = 0;
    }
    memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);

    nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
    nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);

#if CONFIG_JETSAM
    /* Apply overrides */
    if (!PE_parse_boot_argn("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage))) {
        PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
    }
    if (delta_percentage == 0) {
        delta_percentage = 5;
    }
    if (max_mem > config_jetsam_large_memory_cutoff) {
        critical_threshold_percentage = critical_threshold_percentage_larger_devices;
        delta_percentage = delta_percentage_larger_devices;
    }
    assert(delta_percentage < 100);
    if (!PE_parse_boot_argn("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage))) {
        PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
    }
    assert(critical_threshold_percentage < 100);
    PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
    assert(idle_offset_percentage < 100);
    PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
    assert(pressure_threshold_percentage < 100);
    PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
    assert(freeze_threshold_percentage < 100);
1458
1459
1460 if (!PE_parse_boot_argn("jetsam_aging_policy", &jetsam_aging_policy,
1461 sizeof(jetsam_aging_policy))) {
1462 if (!PE_get_default("kern.jetsam_aging_policy", &jetsam_aging_policy,
1463 sizeof(jetsam_aging_policy))) {
1464 jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst;
1465 }
1466 }
1467
1468 if (jetsam_aging_policy > kJetsamAgingPolicyMax) {
1469 jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst;
1470 }
1471
1472 switch (jetsam_aging_policy) {
1473 case kJetsamAgingPolicyNone:
1474 system_procs_aging_band = JETSAM_PRIORITY_IDLE;
1475 applications_aging_band = JETSAM_PRIORITY_IDLE;
1476 break;
1477
1478 case kJetsamAgingPolicyLegacy:
1479 /*
1480 * Legacy behavior where some daemons get a 10s protection once
1481 * AND only before the first clean->dirty->clean transition before
1482 * going into IDLE band.
1483 */
1484 system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
1485 applications_aging_band = JETSAM_PRIORITY_IDLE;
1486 break;
1487
1488 case kJetsamAgingPolicySysProcsReclaimedFirst:
1489 system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
1490 applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
1491 break;
1492
1493 case kJetsamAgingPolicyAppsReclaimedFirst:
1494 system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2;
1495 applications_aging_band = JETSAM_PRIORITY_AGING_BAND1;
1496 break;
1497
1498 default:
1499 break;
1500 }
1501
1502 /*
1503 * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE
1504 * band and must be below it in priority. This is so that we don't have to make
1505 * our 'aging' code worry about a mix of processes, some of which need to age
1506 * and some others that need to stay elevated in the jetsam bands.
1507 */
1508 assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band);
1509 assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band);
1510
1511 /* Take snapshots for idle-exit kills by default? First check the boot-arg... */
1512 if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) {
1513 /* ...no boot-arg, so check the device tree */
1514 PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));
1515 }
1516
1517 memorystatus_delta = (unsigned int) (delta_percentage * atop_64(max_mem) / 100);
1518 memorystatus_available_pages_critical_idle_offset = (unsigned int) (idle_offset_percentage * atop_64(max_mem) / 100);
1519 memorystatus_available_pages_critical_base = (unsigned int) ((critical_threshold_percentage / delta_percentage) * memorystatus_delta);
1520 memorystatus_policy_more_free_offset_pages = (unsigned int) ((policy_more_free_offset_percentage / delta_percentage) * memorystatus_delta);
1521 memorystatus_sysproc_aging_aggr_pages = (unsigned int) (sysproc_aging_aggr_threshold_percentage * atop_64(max_mem) / 100);
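/*
 * Worked example (hypothetical device, for illustration only): with
 * max_mem == 4 GB and 16 KB pages, atop_64(max_mem) == 262144 pages.
 * Assuming delta_percentage == 5 and critical_threshold_percentage == 15:
 *     memorystatus_delta                         == 5 * 262144 / 100      == 13107 pages
 *     memorystatus_available_pages_critical_base == (15 / 5) * 13107      == 39321 pages
 * Note that the percentage globals are integral types (as their use with
 * PE_parse_boot_argn() above suggests), so the
 * (critical_threshold_percentage / delta_percentage) division truncates;
 * the ratios only come out exact when the percentages divide evenly.
 */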
1522
1523 /* Jetsam Loop Detection */
1524 if (max_mem <= (512 * 1024 * 1024)) {
1525 /* 512 MB devices */
1526 memorystatus_jld_eval_period_msecs = 8000; /* 8000 msecs == 8 second window */
1527 } else {
1528 /* 1GB and larger devices */
1529 memorystatus_jld_eval_period_msecs = 6000; /* 6000 msecs == 6 second window */
1530 }
1531
1532 memorystatus_jld_enabled = TRUE;
1533
1534 /* No contention at this point */
1535 memorystatus_update_levels_locked(FALSE);
1536
1537 #endif /* CONFIG_JETSAM */
1538
1539 #if __arm64__
1540 if (!PE_parse_boot_argn("entitled_max_task_pmem", &memorystatus_entitled_max_task_footprint_mb,
1541 sizeof(memorystatus_entitled_max_task_footprint_mb))) {
1542 if (!PE_get_default("kern.entitled_max_task_pmem", &memorystatus_entitled_max_task_footprint_mb,
1543 sizeof(memorystatus_entitled_max_task_footprint_mb))) {
1544 // entitled_max_task_pmem is not supported on this system.
1545 memorystatus_entitled_max_task_footprint_mb = 0;
1546 }
1547 }
1548 if (memorystatus_entitled_max_task_footprint_mb > max_mem / (1UL << 20) || memorystatus_entitled_max_task_footprint_mb < 0) {
1549 os_log_with_startup_serial(OS_LOG_DEFAULT, "Invalid value (%d) for entitled_max_task_pmem. Setting to 0",
1550 memorystatus_entitled_max_task_footprint_mb);
memorystatus_entitled_max_task_footprint_mb = 0; /* apply the reset that the log message above promises */
1551 }
1552 #endif /* __arm64__ */
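/*
 * Example (hypothetical value): the boot-arg entitled_max_task_pmem=2048
 * would let suitably entitled tasks run with a 2 GB footprint limit,
 * provided 2048 MB does not exceed max_mem (validated above).
 */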
1553
1554 memorystatus_jetsam_snapshot_max = maxproc;
1555
1556 memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
1557 (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
1558
1559 memorystatus_jetsam_snapshot = kalloc_flags(memorystatus_jetsam_snapshot_size, Z_WAITOK | Z_ZERO);
1560 if (!memorystatus_jetsam_snapshot) {
1561 panic("Could not allocate memorystatus_jetsam_snapshot");
1562 }
1563
1564 memorystatus_jetsam_snapshot_copy = kalloc_flags(memorystatus_jetsam_snapshot_size, Z_WAITOK | Z_ZERO);
1565 if (!memorystatus_jetsam_snapshot_copy) {
1566 panic("Could not allocate memorystatus_jetsam_snapshot_copy");
1567 }
1568
1569 #if CONFIG_FREEZE
1570 memorystatus_jetsam_snapshot_freezer_max = memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR;
1571 memorystatus_jetsam_snapshot_freezer_size = sizeof(memorystatus_jetsam_snapshot_t) +
1572 (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_freezer_max);
1573
1574 memorystatus_jetsam_snapshot_freezer = kalloc_flags(memorystatus_jetsam_snapshot_freezer_size, Z_WAITOK | Z_ZERO);
1575 if (!memorystatus_jetsam_snapshot_freezer) {
1576 panic("Could not allocate memorystatus_jetsam_snapshot_freezer");
1577 }
1578 #endif /* CONFIG_FREEZE */
1579
1580 nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);
1581
1582 memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));
1583
1584 #if CONFIG_FREEZE
1585 memorystatus_freeze_threshold = (unsigned int) ((freeze_threshold_percentage / delta_percentage) * memorystatus_delta);
1586 #endif
1587
1588 /* Check the boot-arg to see if fast jetsam is allowed */
1589 if (!PE_parse_boot_argn("fast_jetsam_enabled", &fast_jetsam_enabled, sizeof(fast_jetsam_enabled))) {
1590 fast_jetsam_enabled = 0;
1591 }
1592
1593 /* Check the boot-arg to configure the maximum number of jetsam threads */
1594 if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof(max_jetsam_threads))) {
1595 max_jetsam_threads = JETSAM_THREADS_LIMIT;
1596 }
1597
1598 /* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */
1599 if (max_jetsam_threads > JETSAM_THREADS_LIMIT) {
1600 max_jetsam_threads = JETSAM_THREADS_LIMIT;
1601 }
1602
1603 /* For low-CPU systems, disable the fast jetsam mechanism */
1604 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
1605 max_jetsam_threads = 1;
1606 fast_jetsam_enabled = 0;
1607 }
1608
1609 /* Initialize the jetsam_threads state array */
1610 jetsam_threads = zalloc_permanent(sizeof(struct jetsam_thread_state) *
1611 max_jetsam_threads, ZALIGN(struct jetsam_thread_state));
1612
1613 /* Initialize all the jetsam threads */
1614 for (i = 0; i < max_jetsam_threads; i++) {
1615 jetsam_threads[i].inited = FALSE;
1616 jetsam_threads[i].index = i;
1617 result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread);
1618 if (result != KERN_SUCCESS) {
1619 panic("Could not create memorystatus_thread %d", i);
1620 }
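/*
 * kernel_thread_start_priority() hands back a +1 reference on the new
 * thread; release it here, since the running thread keeps itself alive.
 */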
1621 thread_deallocate(jetsam_threads[i].thread);
1622 }
1623 }
1624
1625 /* Centralised for the purposes of allowing panic-on-jetsam */
1626 extern void
1627 vm_run_compactor(void);
1628
1629 /*
1630 * The no-frills jetsam kill call
1631 * Return: 0 on success
1632 * error code on failure (EINVAL...)
1633 */
1634 static int
1635 jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason)
1636 {
1637 int error = 0;
1638 error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason);
1639 return error;
1640 }
1641
1642 /*
1643 * Wrapper for processes exiting with memorystatus details
1644 */
1645 static boolean_t
1646 memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64_t *footprint_of_killed_proc)
1647 {
1648 int error = 0;
1649 __unused pid_t victim_pid = p->p_pid;
1650 uint64_t footprint = get_task_phys_footprint(p->task);
1651 #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
1652 int32_t memstat_effectivepriority = p->p_memstat_effectivepriority;
1653 #endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
1654
1655 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START,
1656 victim_pid, cause, vm_page_free_count, footprint, 0);
1657 DTRACE_MEMORYSTATUS4(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause, uint64_t, footprint);
1658 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
1659 if (memorystatus_jetsam_panic_debug & (1 << cause)) {
1660 panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause);
1661 }
1662 #else
1663 #pragma unused(cause)
1664 #endif
1665
1666 if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
1667 printf("memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n", p->p_pid,
1668 (*p->p_name ? p->p_name : "unknown"),
1669 memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority,
1670 (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
1671 }
1672
1673 /*
1674 * The jetsam_reason (os_reason_t) has enough information about the kill cause.
1675 * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped.
1676 */
1677 int jetsam_flags = P_LTERM_JETSAM;
1678 switch (cause) {
1679 case kMemorystatusKilledHiwat: jetsam_flags |= P_JETSAM_HIWAT; break;
1680 case kMemorystatusKilledVnodes: jetsam_flags |= P_JETSAM_VNODE; break;
1681 case kMemorystatusKilledVMPageShortage: jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
1682 case kMemorystatusKilledVMCompressorThrashing:
1683 case kMemorystatusKilledVMCompressorSpaceShortage: jetsam_flags |= P_JETSAM_VMTHRASHING; break;
1684 case kMemorystatusKilledFCThrashing: jetsam_flags |= P_JETSAM_FCTHRASHING; break;
1685 case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break;
1686 case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break;
1687 }
1688 error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
1689 *footprint_of_killed_proc = ((error == 0) ? footprint : 0);
1690
1691 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END,
1692 victim_pid, memstat_effectivepriority, vm_page_free_count, error, 0);
1693
1694 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_START,
1695 victim_pid, cause, vm_page_free_count, *footprint_of_killed_proc, 0);
1696
1697 vm_run_compactor();
1698
1699 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_END,
1700 victim_pid, cause, vm_page_free_count, 0, 0);
1701
1702 return error == 0;
1703 }
1704
1705 /*
1706 * Node manipulation
1707 */
1708
1709 static void
1710 memorystatus_check_levels_locked(void)
1711 {
1712 #if CONFIG_JETSAM
1713 /* Update levels */
1714 memorystatus_update_levels_locked(TRUE);
1715 #else /* CONFIG_JETSAM */
1716 /*
1717 * Nothing to do here currently since we update
1718 * memorystatus_available_pages in vm_pressure_response.
1719 */
1720 #endif /* CONFIG_JETSAM */
1721 }
1722
1723 /*
1724 * Pin a process to a particular jetsam band when it is in the background, i.e. not doing active work.
1725 * For an application: that means no longer in the FG band
1726 * For a daemon: that means no longer in its 'requested' jetsam priority band
1727 */
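/*
 * Illustrative call (hypothetical caller; the constants are the ones this
 * function checks below):
 *     memorystatus_update_inactive_jetsam_priority_band(pid,
 *         MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE,
 *         JETSAM_PRIORITY_ELEVATED_INACTIVE, TRUE);
 * pins pid into the elevated inactive band immediately; the _DISABLE op
 * undoes it, letting the process drop back toward IDLE.
 */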
1728
1729 int
1730 memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now)
1731 {
1732 int error = 0;
1733 boolean_t enable = FALSE;
1734 proc_t p = NULL;
1735
1736 if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) {
1737 enable = TRUE;
1738 } else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) {
1739 enable = FALSE;
1740 } else {
1741 return EINVAL;
1742 }
1743
1744 p = proc_find(pid);
1745 if (p != NULL) {
1746 if ((enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) ||
1747 (!enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == 0))) {
1748 /*
1749 * No change in state.
1750 */
1751 } else {
1752 proc_list_lock();
1753
1754 if (enable) {
1755 p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
1756 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1757
1758 if (effective_now) {
1759 if (p->p_memstat_effectivepriority < jetsam_prio) {
1760 if (memorystatus_highwater_enabled) {
1761 /*
1762 * Process is about to transition from
1763 * inactive --> active
1764 * assign active state
1765 */
1766 boolean_t is_fatal;
1767 boolean_t use_active = TRUE;
1768 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
1769 task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
1770 }
1771 memorystatus_update_priority_locked(p, jetsam_prio, FALSE, FALSE);
1772 }
1773 } else {
1774 if (isProcessInAgingBands(p)) {
1775 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
1776 }
1777 }
1778 } else {
1779 p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
1780 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1781
1782 if (effective_now) {
1783 if (p->p_memstat_effectivepriority == jetsam_prio) {
1784 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
1785 }
1786 } else {
1787 if (isProcessInAgingBands(p)) {
1788 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
1789 }
1790 }
1791 }
1792
1793 proc_list_unlock();
1794 }
1795 proc_rele(p);
1796 error = 0;
1797 } else {
1798 error = ESRCH;
1799 }
1800
1801 return error;
1802 }
1803
1804 static void
1805 memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
1806 {
1807 proc_t p;
1808 uint64_t current_time = 0, idle_delay_time = 0;
1809 int demote_prio_band = 0;
1810 memstat_bucket_t *demotion_bucket;
1811
1812 MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");
1813
1814 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);
1815
1816 current_time = mach_absolute_time();
1817
1818 proc_list_lock();
1819
1820 demote_prio_band = JETSAM_PRIORITY_IDLE + 1;
1821
1822 for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) {
1823 if (demote_prio_band != system_procs_aging_band && demote_prio_band != applications_aging_band) {
1824 continue;
1825 }
1826
1827 demotion_bucket = &memstat_bucket[demote_prio_band];
1828 p = TAILQ_FIRST(&demotion_bucket->list);
1829
1830 while (p) {
1831 MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid);
1832
1833 assert(p->p_memstat_idledeadline);
1834
1835 assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
1836
1837 if (current_time >= p->p_memstat_idledeadline) {
1838 if ((isSysProc(p) &&
1839 ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) != P_DIRTY_IDLE_EXIT_ENABLED)) || /* system proc marked dirty*/
1840 task_has_assertions((struct task *)(p->task))) { /* has outstanding assertions which might indicate outstanding work too */
1841 idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p);
1842
1843 p->p_memstat_idledeadline += idle_delay_time;
1844 p = TAILQ_NEXT(p, p_memstat_list);
1845 } else {
1846 proc_t next_proc = NULL;
1847
1848 next_proc = TAILQ_NEXT(p, p_memstat_list);
1849 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1850
1851 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false, true);
1852
1853 p = next_proc;
1854 continue;
1855 }
1856 } else {
1857 // No further candidates
1858 break;
1859 }
1860 }
1861 }
1862
1863 memorystatus_reschedule_idle_demotion_locked();
1864
1865 proc_list_unlock();
1866
1867 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
1868 }
1869
1870 static void
1871 memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
1872 {
1873 boolean_t present_in_sysprocs_aging_bucket = FALSE;
1874 boolean_t present_in_apps_aging_bucket = FALSE;
1875 uint64_t idle_delay_time = 0;
1876
1877 if (jetsam_aging_policy == kJetsamAgingPolicyNone) {
1878 return;
1879 }
1880
1881 if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) ||
1882 (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION)) {
1883 /*
1884 * This process isn't going to be making the trip to the lower bands.
1885 */
1886 return;
1887 }
1888
1889 if (isProcessInAgingBands(p)) {
1890 if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
1891 assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) != P_DIRTY_AGING_IN_PROGRESS);
1892 }
1893
1894 if (isSysProc(p) && system_procs_aging_band) {
1895 present_in_sysprocs_aging_bucket = TRUE;
1896 } else if (isApp(p) && applications_aging_band) {
1897 present_in_apps_aging_bucket = TRUE;
1898 }
1899 }
1900
1901 assert(!present_in_sysprocs_aging_bucket);
1902 assert(!present_in_apps_aging_bucket);
1903
1904 MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n",
1905 p->p_pid, p->p_memstat_dirty, set_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));
1906
1907 if (isSysProc(p)) {
1908 assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED);
1909 }
1910
1911 idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p);
1912 if (set_state) {
1913 p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS;
1914 p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time;
1915 }
1916
1917 assert(p->p_memstat_idledeadline);
1918
1919 if (isSysProc(p) && present_in_sysprocs_aging_bucket == FALSE) {
1920 memorystatus_scheduled_idle_demotions_sysprocs++;
1921 } else if (isApp(p) && present_in_apps_aging_bucket == FALSE) {
1922 memorystatus_scheduled_idle_demotions_apps++;
1923 }
1924 }
1925
1926 void
1927 memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
1928 {
1929 boolean_t present_in_sysprocs_aging_bucket = FALSE;
1930 boolean_t present_in_apps_aging_bucket = FALSE;
1931
1932 if (!system_procs_aging_band && !applications_aging_band) {
1933 return;
1934 }
1935
1936 if ((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == 0) {
1937 return;
1938 }
1939
1940 if (isProcessInAgingBands(p)) {
1941 if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
1942 assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == P_DIRTY_AGING_IN_PROGRESS);
1943 }
1944
1945 if (isSysProc(p) && system_procs_aging_band) {
1946 assert(p->p_memstat_effectivepriority == system_procs_aging_band);
1947 assert(p->p_memstat_idledeadline);
1948 present_in_sysprocs_aging_bucket = TRUE;
1949 } else if (isApp(p) && applications_aging_band) {
1950 assert(p->p_memstat_effectivepriority == applications_aging_band);
1951 assert(p->p_memstat_idledeadline);
1952 present_in_apps_aging_bucket = TRUE;
1953 }
1954 }
1955
1956 MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for pid %d (clear_state %d, demotions %d).\n",
1957 p->p_pid, clear_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));
1958
1959
1960 if (clear_state) {
1961 p->p_memstat_idledeadline = 0;
1962 p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS;
1963 }
1964
1965 if (isSysProc(p) && present_in_sysprocs_aging_bucket == TRUE) {
1966 memorystatus_scheduled_idle_demotions_sysprocs--;
1967 assert(memorystatus_scheduled_idle_demotions_sysprocs >= 0);
1968 } else if (isApp(p) && present_in_apps_aging_bucket == TRUE) {
1969 memorystatus_scheduled_idle_demotions_apps--;
1970 assert(memorystatus_scheduled_idle_demotions_apps >= 0);
1971 }
1972
1973 assert((memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps) >= 0);
1974 }
1975
1976 static void
1977 memorystatus_reschedule_idle_demotion_locked(void)
1978 {
1979 if (0 == (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)) {
1980 if (memstat_idle_demotion_deadline) {
1981 /* Transitioned 1->0, so cancel next call */
1982 thread_call_cancel(memorystatus_idle_demotion_call);
1983 memstat_idle_demotion_deadline = 0;
1984 }
1985 } else {
1986 memstat_bucket_t *demotion_bucket;
1987 proc_t p = NULL, p1 = NULL, p2 = NULL;
1988
1989 if (system_procs_aging_band) {
1990 demotion_bucket = &memstat_bucket[system_procs_aging_band];
1991 p1 = TAILQ_FIRST(&demotion_bucket->list);
1992
1993 p = p1;
1994 }
1995
1996 if (applications_aging_band) {
1997 demotion_bucket = &memstat_bucket[applications_aging_band];
1998 p2 = TAILQ_FIRST(&demotion_bucket->list);
1999
2000 if (p1 && p2) {
2001 p = (p1->p_memstat_idledeadline > p2->p_memstat_idledeadline) ? p2 : p1;
2002 } else {
2003 p = (p1 == NULL) ? p2 : p1;
2004 }
2005 }
2006
2007 assert(p);
2008
2009 if (p != NULL) {
2010 assert(p && p->p_memstat_idledeadline);
2011 if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline) {
2012 thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
2013 memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
2014 }
2015 }
2016 }
2017 }
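/*
 * Note on the scheduling above: the thread call is armed for the earliest
 * p_memstat_idledeadline found at the head of the two aging buckets. For
 * example (hypothetical deadlines), a sysproc ripe in 5s and an app ripe
 * in 2s arm the callout for ~2s out; the demotion pass then re-evaluates
 * and re-arms for the next deadline.
 */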
2018
2019 /*
2020 * List manipulation
2021 */
2022
2023 int
2024 memorystatus_add(proc_t p, boolean_t locked)
2025 {
2026 memstat_bucket_t *bucket;
2027
2028 MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority);
2029
2030 if (!locked) {
2031 proc_list_lock();
2032 }
2033
2034 DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority);
2035
2036 /* Processes marked internal do not have priority tracked */
2037 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
2038 goto exit;
2039 }
2040
2041 /*
2042 * Opt system processes out of being frozen by default.
2043 * For coalition-based freezing, we only want to freeze sysprocs that have specifically opted in.
2044 */
2045 if (isSysProc(p)) {
2046 p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED;
2047 }
2048
2049 bucket = &memstat_bucket[p->p_memstat_effectivepriority];
2050
2051 if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
2052 assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs - 1);
2053 } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
2054 assert(bucket->count == memorystatus_scheduled_idle_demotions_apps - 1);
2055 } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2056 /*
2057 * Entering the idle band.
2058 * Record idle start time.
2059 */
2060 p->p_memstat_idle_start = mach_absolute_time();
2061 }
2062
2063 TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
2064 bucket->count++;
2065 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
2066 bucket->relaunch_high_count++;
2067 }
2068
2069 memorystatus_list_count++;
2070
2071 memorystatus_check_levels_locked();
2072
2073 exit:
2074 if (!locked) {
2075 proc_list_unlock();
2076 }
2077
2078 return 0;
2079 }
2080
2081 /*
2082 * Description:
2083 * Moves a process from one jetsam bucket to another,
2084 * which changes the LRU position of the process.
2085 *
2086 * Monitors transition between buckets and if necessary
2087 * will update cached memory limits accordingly.
2088 *
2089 * skip_demotion_check:
2090 * - if the 'jetsam aging policy' is NOT 'legacy':
2091 * When this flag is TRUE, it means we are going
2092 * to age the ripe processes out of the aging bands and into the
2093 * IDLE band and apply their inactive memory limits.
2094 *
2095 * - if the 'jetsam aging policy' is 'legacy':
2096 * When this flag is TRUE, it might mean the above aging mechanism
2097 * OR
2098 * It might be that we have a process that has used up its 'idle deferral'
2099 * stay that is given to it once per lifetime. And in this case, the process
2100 * won't be going through any aging codepaths. But we still need to apply
2101 * the right inactive limits and so we explicitly set this to TRUE if the
2102 * new priority for the process is the IDLE band.
2103 */
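/*
 * Example: under kJetsamAgingPolicySysProcsReclaimedFirst, a ripe sysproc
 * aging out of its band is moved to JETSAM_PRIORITY_IDLE with
 * skip_demotion_check == TRUE (see memorystatus_perform_idle_demotion()),
 * so its INACTIVE memory limit is applied as it lands in the IDLE band.
 */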
2104 void
2105 memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check)
2106 {
2107 memstat_bucket_t *old_bucket, *new_bucket;
2108
2109 assert(priority < MEMSTAT_BUCKET_COUNT);
2110
2111 /* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
2112 if ((p->p_listflag & P_LIST_EXITED) != 0) {
2113 return;
2114 }
2115
2116 MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting %s(%d) to priority %d, inserting at %s\n",
2117 (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, head_insert ? "head" : "tail");
2118
2119 DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority);
2120
2121 old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
2122
2123 if (skip_demotion_check == FALSE) {
2124 if (isSysProc(p)) {
2125 /*
2126 * For system processes, the memorystatus_dirty_* routines take care of adding/removing
2127 * the processes from the aging bands and balancing the demotion counts.
2128 * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute.
2129 */
2130
2131 if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
2132 /*
2133 * 2 types of processes can use the non-standard elevated inactive band:
2134 * - Frozen processes that always land in memorystatus_freeze_jetsam_band
2135 * OR
2136 * - processes that specifically opt-in to the elevated inactive support e.g. docked processes.
2137 */
2138 #if CONFIG_FREEZE
2139 if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
2140 if (priority <= memorystatus_freeze_jetsam_band) {
2141 priority = memorystatus_freeze_jetsam_band;
2142 }
2143 } else
2144 #endif /* CONFIG_FREEZE */
2145 {
2146 if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
2147 priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
2148 }
2149 }
2150 assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
2151 }
2152 } else if (isApp(p)) {
2153 /*
2154 * Check to see if the application is being lowered in jetsam priority. If so, and:
2155 * - it has an 'elevated inactive jetsam band' attribute, then put it in the appropriate band.
2156 * - it is a normal application, then let it age in the aging band if that policy is in effect.
2157 */
2158
2159 if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
2160 #if CONFIG_FREEZE
2161 if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
2162 if (priority <= memorystatus_freeze_jetsam_band) {
2163 priority = memorystatus_freeze_jetsam_band;
2164 }
2165 } else
2166 #endif /* CONFIG_FREEZE */
2167 {
2168 if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
2169 priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
2170 }
2171 }
2172 } else {
2173 if (applications_aging_band) {
2174 if (p->p_memstat_effectivepriority == applications_aging_band) {
2175 assert(old_bucket->count == (memorystatus_scheduled_idle_demotions_apps + 1));
2176 }
2177
2178 if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && (priority <= applications_aging_band)) {
2179 assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
2180 priority = applications_aging_band;
2181 memorystatus_schedule_idle_demotion_locked(p, TRUE);
2182 }
2183 }
2184 }
2185 }
2186 }
2187
2188 if ((system_procs_aging_band && (priority == system_procs_aging_band)) || (applications_aging_band && (priority == applications_aging_band))) {
2189 assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
2190 }
2191
2192 #if DEVELOPMENT || DEBUG
2193 if (priority == JETSAM_PRIORITY_IDLE && /* if the process is on its way into the IDLE band */
2194 skip_demotion_check == FALSE && /* and it isn't via the path that will set the INACTIVE memlimits */
2195 (p->p_memstat_dirty & P_DIRTY_TRACK) && /* and it has 'DIRTY' tracking enabled */
2196 ((p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) || /* and we notice that the current limit isn't the right value (inactive) */
2197 ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) ? (!(p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)) : (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)))) { /* OR type (fatal vs non-fatal) */
2198 printf("memorystatus_update_priority_locked: on %s with 0x%x, prio: %d and %d\n", p->p_name, p->p_memstat_state, priority, p->p_memstat_memlimit); /* then we must catch this */
2199 }
2200 #endif /* DEVELOPMENT || DEBUG */
2201
2202 TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
2203 old_bucket->count--;
2204 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
2205 old_bucket->relaunch_high_count--;
2206 }
2207
2208 new_bucket = &memstat_bucket[priority];
2209 if (head_insert) {
2210 TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
2211 } else {
2212 TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
2213 }
2214 new_bucket->count++;
2215 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
2216 new_bucket->relaunch_high_count++;
2217 }
2218
2219 if (memorystatus_highwater_enabled) {
2220 boolean_t is_fatal;
2221 boolean_t use_active;
2222
2223 /*
2224 * If cached limit data is updated, then the limits
2225 * will be enforced by writing to the ledgers.
2226 */
2227 boolean_t ledger_update_needed = TRUE;
2228
2229 /*
2230 * Here, we must update the cached memory limit if the task
2231 * is transitioning between:
2232 * active <--> inactive
2233 * FG <--> BG
2234 * but:
2235 * dirty <--> clean is ignored
2236 *
2237 * We bypass non-idle processes that have opted into dirty tracking because
2238 * a move between buckets does not imply a transition between the
2239 * dirty <--> clean state.
2240 */
2241
2242 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
2243 if (skip_demotion_check == TRUE && priority == JETSAM_PRIORITY_IDLE) {
2244 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
2245 use_active = FALSE;
2246 } else {
2247 ledger_update_needed = FALSE;
2248 }
2249 } else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) {
2250 /*
2251 * inactive --> active
2252 * BG --> FG
2253 * assign active state
2254 */
2255 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
2256 use_active = TRUE;
2257 } else if ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
2258 /*
2259 * active --> inactive
2260 * FG --> BG
2261 * assign inactive state
2262 */
2263 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
2264 use_active = FALSE;
2265 } else {
2266 /*
2267 * The transition between jetsam priority buckets apparently did
2268 * not affect active/inactive state.
2269 * This is not unusual... especially during startup when
2270 * processes are getting established in their respective bands.
2271 */
2272 ledger_update_needed = FALSE;
2273 }
2274
2275 /*
2276 * Enforce the new limits by writing to the ledger
2277 */
2278 if (ledger_update_needed) {
2279 task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
2280
2281 MEMORYSTATUS_DEBUG(3, "memorystatus_update_priority_locked: new limit on pid %d (%dMB %s) priority old --> new (%d --> %d) dirty?=0x%x %s\n",
2282 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
2283 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, priority, p->p_memstat_dirty,
2284 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
2285 }
2286 }
2287
2288 /*
2289 * Record idle start or idle delta.
2290 */
2291 if (p->p_memstat_effectivepriority == priority) {
2292 /*
2293 * This process is not transitioning between
2294 * jetsam priority buckets. Do nothing.
2295 */
2296 } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2297 uint64_t now;
2298 /*
2299 * Transitioning out of the idle priority bucket.
2300 * Record idle delta.
2301 */
2302 assert(p->p_memstat_idle_start != 0);
2303 now = mach_absolute_time();
2304 if (now > p->p_memstat_idle_start) {
2305 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
2306 }
2307
2308 /*
2309 * About to become active and so memory footprint could change.
2310 * So mark it eligible for freeze-considerations next time around.
2311 */
2312 if (p->p_memstat_state & P_MEMSTAT_FREEZE_IGNORE) {
2313 p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE;
2314 }
2315 } else if (priority == JETSAM_PRIORITY_IDLE) {
2316 /*
2317 * Transitioning into the idle priority bucket.
2318 * Record idle start.
2319 */
2320 p->p_memstat_idle_start = mach_absolute_time();
2321 }
2322
2323 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), p->p_pid, priority, p->p_memstat_effectivepriority, 0, 0);
2324
2325 p->p_memstat_effectivepriority = priority;
2326
2327 #if CONFIG_SECLUDED_MEMORY
2328 if (secluded_for_apps &&
2329 task_could_use_secluded_mem(p->task)) {
2330 task_set_can_use_secluded_mem(
2331 p->task,
2332 (priority >= JETSAM_PRIORITY_FOREGROUND));
2333 }
2334 #endif /* CONFIG_SECLUDED_MEMORY */
2335
2336 memorystatus_check_levels_locked();
2337 }
2338
2339 int
2340 memorystatus_relaunch_flags_update(proc_t p, int relaunch_flags)
2341 {
2342 p->p_memstat_relaunch_flags = relaunch_flags;
2343 KDBG(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_RELAUNCH_FLAGS), p->p_pid, relaunch_flags, 0, 0, 0);
2344 return 0;
2345 }
2346
2347 /*
2348 *
2349 * Description: Update the jetsam priority and memory limit attributes for a given process.
2350 *
2351 * Parameters:
2352 * p The process whose jetsam information is being initialized.
2353 * priority The jetsam priority band
2354 * user_data user specific data, unused by the kernel
2355 * is_assertion When true, a priority update is driven by an assertion.
2356 * effective guards against a race if the process's update already occurred
2357 * update_memlimit When true we know this is the init step via the posix_spawn path.
2358 *
2359 * memlimit_active Value in megabytes; The monitored footprint level while the
2360 * process is active. Exceeding it may result in termination
2361 * based on its associated fatal flag.
2362 *
2363 * memlimit_active_is_fatal When a process is active and exceeds its memory footprint,
2364 * this describes whether or not it should be immediately fatal.
2365 *
2366 * memlimit_inactive Value in megabytes; The monitored footprint level while the
2367 * process is inactive. Exceeding it may result in termination
2368 * based on its associated fatal flag.
2369 *
2370 * memlimit_inactive_is_fatal When a process is inactive and exceeds its memory footprint,
2371 * this describes whether or not it should be immediately fatal.
2372 *
2373 * Returns: 0 Success
2374 * non-0 Failure
2375 */
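/*
 * Illustrative invocation (hypothetical limit values) as the init step on
 * the posix_spawn path:
 *     memorystatus_update(p, JETSAM_PRIORITY_DEFAULT, 0,
 *         FALSE,          // not assertion driven
 *         FALSE,          // not an 'effective'-guarded update
 *         TRUE,           // instantiate ledger limits
 *         300, TRUE,      // active: 300 MB, fatal
 *         150, FALSE);    // inactive: 150 MB, non-fatal
 */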
2376
2377 int
2378 memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t is_assertion, boolean_t effective, boolean_t update_memlimit,
2379 int32_t memlimit_active, boolean_t memlimit_active_is_fatal,
2380 int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal)
2381 {
2382 int ret;
2383 boolean_t head_insert = false;
2384
2385 MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing (%s) pid %d: priority %d, user_data 0x%llx\n", (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, user_data);
2386
2387 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0);
2388
2389 if (priority == -1) {
2390 /* Use as shorthand for default priority */
2391 priority = JETSAM_PRIORITY_DEFAULT;
2392 } else if ((priority == system_procs_aging_band) || (priority == applications_aging_band)) {
2393 /* Both the aging bands are reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
2394 priority = JETSAM_PRIORITY_IDLE;
2395 } else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
2396 /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
2397 priority = JETSAM_PRIORITY_IDLE;
2398 head_insert = TRUE;
2399 } else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
2400 /* Sanity check */
2401 ret = EINVAL;
2402 goto out;
2403 }
2404
2405 proc_list_lock();
2406
2407 assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
2408
2409 if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
2410 ret = EALREADY;
2411 proc_list_unlock();
2412 MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid);
2413 goto out;
2414 }
2415
2416 if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) {
2417 /*
2418 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
2419 */
2420 ret = EBUSY;
2421 proc_list_unlock();
2422 goto out;
2423 }
2424
2425 p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
2426 p->p_memstat_userdata = user_data;
2427
2428 if (is_assertion) {
2429 if (priority == JETSAM_PRIORITY_IDLE) {
2430 /*
2431 * Assertions relinquish control when the process is heading to IDLE.
2432 */
2433 if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
2434 /*
2435 * Mark the process as no longer being managed by assertions.
2436 */
2437 p->p_memstat_state &= ~P_MEMSTAT_PRIORITY_ASSERTION;
2438 } else {
2439 /*
2440 * Ignore an idle priority transition if the process is not
2441 * already managed by assertions. We won't treat this as
2442 * an error, but we will log the unexpected behavior and bail.
2443 */
2444 os_log(OS_LOG_DEFAULT, "memorystatus: Ignore assertion driven idle priority. Process not previously controlled %s:%d\n",
2445 (*p->p_name ? p->p_name : "unknown"), p->p_pid);
2446
2447 ret = 0;
2448 proc_list_unlock();
2449 goto out;
2450 }
2451 } else {
2452 /*
2453 * Process is now being managed by assertions.
2454 */
2455 p->p_memstat_state |= P_MEMSTAT_PRIORITY_ASSERTION;
2456 }
2457
2458 /* Always update the assertion priority in this path */
2459
2460 p->p_memstat_assertionpriority = priority;
2461
2462 int memstat_dirty_flags = memorystatus_dirty_get(p, TRUE); /* proc_list_lock is held */
2463
2464 if (memstat_dirty_flags != 0) {
2465 /*
2466 * Calculate maximum priority only when dirty tracking processes are involved.
2467 */
2468 int maxpriority;
2469 if (memstat_dirty_flags & PROC_DIRTY_IS_DIRTY) {
2470 maxpriority = MAX(p->p_memstat_assertionpriority, p->p_memstat_requestedpriority);
2471 } else {
2472 /* clean */
2473
2474 if (memstat_dirty_flags & PROC_DIRTY_ALLOWS_IDLE_EXIT) {
2475 /*
2476 * The aging policy must be evaluated and applied here because runningboardd
2477 * has relinquished its hold on the jetsam priority by attempting to move a
2478 * clean process to the idle band.
2479 */
2480
2481 int newpriority = JETSAM_PRIORITY_IDLE;
2482 if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
2483 newpriority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE;
2484 }
2485
2486 maxpriority = MAX(p->p_memstat_assertionpriority, newpriority);
2487
2488 if (newpriority == system_procs_aging_band) {
2489 memorystatus_schedule_idle_demotion_locked(p, FALSE);
2490 }
2491 } else {
2492 /*
2493 * Preserves requestedpriority when the process does not support pressured exit.
2494 */
2495 maxpriority = MAX(p->p_memstat_assertionpriority, p->p_memstat_requestedpriority);
2496 }
2497 }
2498 priority = maxpriority;
2499 }
2500 } else {
2501 p->p_memstat_requestedpriority = priority;
2502 }
2503
2504 if (update_memlimit) {
2505 boolean_t is_fatal;
2506 boolean_t use_active;
2507
2508 /*
2509 * Posix_spawn'd processes come through this path to instantiate ledger limits.
2510 * Forked processes do not come through this path, so no ledger limits exist.
2511 * (That's why forked processes can consume unlimited memory.)
2512 */
2513
2514 MEMORYSTATUS_DEBUG(3, "memorystatus_update(enter): pid %d, priority %d, dirty=0x%x, Active(%dMB %s), Inactive(%dMB, %s)\n",
2515 p->p_pid, priority, p->p_memstat_dirty,
2516 memlimit_active, (memlimit_active_is_fatal ? "F " : "NF"),
2517 memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF"));
2518
2519 if (memlimit_active <= 0) {
2520 /*
2521 * This process will have a system_wide task limit when active.
2522 * System_wide task limit is always fatal.
2523 * It's quite common to see a non-fatal flag passed in here.
2524 * It's not an error, we just ignore it.
2525 */
2526
2527 /*
2528 * For backward compatibility with some unexplained launchd behavior,
2529 * we allow a zero sized limit. But we still enforce system_wide limit
2530 * when written to the ledgers.
2531 */
2532
2533 if (memlimit_active < 0) {
2534 memlimit_active = -1; /* enforces system_wide task limit */
2535 }
2536 memlimit_active_is_fatal = TRUE;
2537 }
2538
2539 if (memlimit_inactive <= 0) {
2540 /*
2541 * This process will have a system_wide task limit when inactive.
2542 * System_wide task limit is always fatal.
2543 */
2544
2545 memlimit_inactive = -1;
2546 memlimit_inactive_is_fatal = TRUE;
2547 }
2548
2549 /*
2550 * Initialize the active limit variants for this process.
2551 */
2552 SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);
2553
2554 /*
2555 * Initialize the inactive limit variants for this process.
2556 */
2557 SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);
2558
2559 /*
2560 * Initialize the cached limits for target process.
2561 * When the target process is dirty tracked, it's typically
2562 * in a clean state. Non dirty tracked processes are
2563 * typically active (Foreground or above).
2564 * But just in case, we don't make assumptions...
2565 */
2566
2567 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
2568 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
2569 use_active = TRUE;
2570 } else {
2571 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
2572 use_active = FALSE;
2573 }
2574
2575 /*
2576 * Enforce the cached limit by writing to the ledger.
2577 */
2578 if (memorystatus_highwater_enabled) {
2579 /* apply now */
2580 task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal);
2581
2582 MEMORYSTATUS_DEBUG(3, "memorystatus_update: init: limit on pid %d (%dMB %s) targeting priority(%d) dirty?=0x%x %s\n",
2583 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
2584 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), priority, p->p_memstat_dirty,
2585 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
2586 }
2587 }
2588
2589 /*
2590 * We can't add to the aging bands buckets here.
2591 * But, we could be removing it from those buckets.
2592 * Check and take appropriate steps if so.
2593 */
2594
2595 if (isProcessInAgingBands(p)) {
2596 if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && isApp(p) && (priority > applications_aging_band)) {
2597 /*
2598 * Runningboardd is pulling up an application that is in the aging band.
2599 * We reset the app's state here so that it'll get a fresh stay in the
2600 * aging band on the way back.
2601 *
2602 * We always handled the app 'aging' in the memorystatus_update_priority_locked()
2603 * function. Daemons used to be handled via the dirty 'set/clear/track' path.
2604 * But with extensions (daemon-app hybrid), runningboardd is now going through
2605 * this routine for daemons too and things have gotten a bit tangled. This should
2606 * be simplified/untangled at some point and might require some assistance from
2607 * runningboardd.
2608 */
2609 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2610 } else {
2611 memorystatus_invalidate_idle_demotion_locked(p, FALSE);
2612 }
2613 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
2614 } else {
2615 if (jetsam_aging_policy == kJetsamAgingPolicyLegacy && priority == JETSAM_PRIORITY_IDLE) {
2616 /*
2617 * Daemons with 'inactive' limits will go through the dirty tracking codepath.
2618 * This path deals with apps that may have 'inactive' limits e.g. WebContent processes.
2619 * If this is the legacy aging policy we explicitly need to apply those limits. If it
2620 * is any other aging policy, then we don't need to worry because all processes
2621 * will go through the aging bands and then the demotion thread will take care to
2622 * move them into the IDLE band and apply the required limits.
2623 */
2624 memorystatus_update_priority_locked(p, priority, head_insert, TRUE);
2625 }
2626 }
2627
2628 memorystatus_update_priority_locked(p, priority, head_insert, FALSE);
2629
2630 proc_list_unlock();
2631 ret = 0;
2632
2633 out:
2634 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);
2635
2636 return ret;
2637 }
2638
2639 int
2640 memorystatus_remove(proc_t p)
2641 {
2642 int ret;
2643 memstat_bucket_t *bucket;
2644 boolean_t reschedule = FALSE;
2645
2646 MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", p->p_pid);
2647
2648 /*
2649 * Check if this proc is locked (because we're performing a freeze).
2650 * If so, we fail and instruct the caller to try again later.
2651 */
2652 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
2653 return EAGAIN;
2654 }
2655
2656 assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
2657
2658 bucket = &memstat_bucket[p->p_memstat_effectivepriority];
2659
2660 if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
2661 assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs);
2662 reschedule = TRUE;
2663 } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
2664 assert(bucket->count == memorystatus_scheduled_idle_demotions_apps);
2665 reschedule = TRUE;
2666 }
2667
2668 /*
2669 * Record idle delta
2670 */
2671
2672 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2673 uint64_t now = mach_absolute_time();
2674 if (now > p->p_memstat_idle_start) {
2675 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
2676 }
2677 }
2678
2679 TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
2680 bucket->count--;
2681 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
2682 bucket->relaunch_high_count--;
2683 }
2684
2685 memorystatus_list_count--;
2686
2687 /* If awaiting demotion to the idle band, clean up */
2688 if (reschedule) {
2689 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2690 memorystatus_reschedule_idle_demotion_locked();
2691 }
2692
2693 memorystatus_check_levels_locked();
2694
2695 #if CONFIG_FREEZE
2696 if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
2697 if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
2698 p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
2699 memorystatus_refreeze_eligible_count--;
2700 }
2701
2702 memorystatus_frozen_count--;
2703 memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
2704 p->p_memstat_freeze_sharedanon_pages = 0;
2705 }
2706
2707 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
2708 memorystatus_suspended_count--;
2709 }
2710 #endif
2711
2712 #if DEVELOPMENT || DEBUG
2713 if (p->p_pid == memorystatus_snapshot_owner) {
2714 memorystatus_snapshot_owner = 0;
2715 }
2716 #endif /* DEVELOPMENT || DEBUG */
2717
2718 if (p) {
2719 ret = 0;
2720 } else {
2721 ret = ESRCH;
2722 }
2723
2724 return ret;
2725 }
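/*
 * A caller racing with the freezer should be prepared to retry on EAGAIN,
 * e.g. (hypothetical sketch, not an actual caller in this file):
 *     while (memorystatus_remove(p) == EAGAIN) {
 *         delay(1);    // freeze holds P_MEMSTAT_LOCKED; back off and retry
 *     }
 */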
2726
2727 /*
2728 * Validate dirty tracking flags with process state.
2729 *
2730 * Return:
2731 * 0 on success
2732 * non-0 on failure
2733 *
2734 * The proc_list_lock is held by the caller.
2735 */
2736
2737 static int
2738 memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol)
2739 {
2740 /* See that the process isn't marked for termination */
2741 if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
2742 return EBUSY;
2743 }
2744
2745 /* Idle exit requires that process be tracked */
2746 if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
2747 !(pcontrol & PROC_DIRTY_TRACK)) {
2748 return EINVAL;
2749 }
2750
2751 /* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
2752 if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
2753 !(pcontrol & PROC_DIRTY_TRACK)) {
2754 return EINVAL;
2755 }
2756
2757 /* Only one type of DEFER behavior is allowed. */
2758 if ((pcontrol & PROC_DIRTY_DEFER) &&
2759 (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) {
2760 return EINVAL;
2761 }
2762
2763 /* Deferral is only relevant if idle exit is specified */
2764 if (((pcontrol & PROC_DIRTY_DEFER) ||
2765 (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) &&
2766 !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
2767 return EINVAL;
2768 }
2769
2770 return 0;
2771 }
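/*
 * Examples of how the checks above combine (pcontrol values):
 *     PROC_DIRTY_TRACK                                -> 0 (valid)
 *     PROC_DIRTY_TRACK | PROC_DIRTY_ALLOW_IDLE_EXIT   -> 0 (valid)
 *     PROC_DIRTY_ALLOW_IDLE_EXIT alone                -> EINVAL (idle exit requires tracking)
 *     PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS      -> EINVAL (mutually exclusive)
 */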
2772
2773 static void
2774 memorystatus_update_idle_priority_locked(proc_t p)
2775 {
2776 int32_t priority;
2777
2778 MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty);
2779
2780 assert(isSysProc(p));
2781
2782 if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
2783 priority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE;
2784 } else {
2785 priority = p->p_memstat_requestedpriority;
2786 }
2787
2788 if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
2789 /*
2790 * This process has a jetsam priority managed by an assertion.
2791 * Policy is to choose the max priority.
2792 */
2793 if (p->p_memstat_assertionpriority > priority) {
2794 os_log(OS_LOG_DEFAULT, "memorystatus: assertion priority %d overrides priority %d for %s:%d\n",
2795 p->p_memstat_assertionpriority, priority,
2796 (*p->p_name ? p->p_name : "unknown"), p->p_pid);
2797 priority = p->p_memstat_assertionpriority;
2798 }
2799 }
2800
2801 if (priority != p->p_memstat_effectivepriority) {
2802 if ((jetsam_aging_policy == kJetsamAgingPolicyLegacy) &&
2803 (priority == JETSAM_PRIORITY_IDLE)) {
2804 /*
2805 * This process is on its way into the IDLE band. The system is
2806 * using 'legacy' jetsam aging policy. That means, this process
2807 * has already used up its idle-deferral aging time that is given
2808 * once per its lifetime. So we need to set the INACTIVE limits
2809 * explicitly because it won't be going through the demotion paths
2810 * that take care to apply the limits appropriately.
2811 */
2812
2813 if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
2814 /*
2815 * This process has the 'elevated inactive jetsam band' attribute.
2816 * So, there will be no trip to IDLE after all.
2817 * Instead, we pin the process in the elevated band,
2818 * where its ACTIVE limits will apply.
2819 */
2820
2821 priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
2822 }
2823
2824 memorystatus_update_priority_locked(p, priority, false, true);
2825 } else {
2826 memorystatus_update_priority_locked(p, priority, false, false);
2827 }
2828 }
2829 }
2830
2831 /*
2832 * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
2833 * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
2834 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
2835 * priority idle band when clean (and killed earlier, protecting higher priority processes).
2836 *
2837 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
2838 * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
2839 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
2840 * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
2841 * band. The deferral can be cleared early by clearing the appropriate flag.
2842 *
2843 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
2844 * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
2845 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
2846 */
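/*
 * Example timeline under the deferral scheme described above (times
 * hypothetical):
 *     t0  clean, tracking + defer enabled -> parked in the aging band, deadline armed
 *     t1  goes dirty before the deadline  -> demotion cancelled, promoted to its requested band
 *     t2  goes clean again                -> deferral re-armed (P_DIRTY_DEFER_ALWAYS) or guard cleared (P_DIRTY_DEFER)
 *     t3  deadline passes while clean     -> demoted to JETSAM_PRIORITY_IDLE
 */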
2847
2848 int
2849 memorystatus_dirty_track(proc_t p, uint32_t pcontrol)
2850 {
2851 unsigned int old_dirty;
2852 boolean_t reschedule = FALSE;
2853 boolean_t already_deferred = FALSE;
2854 boolean_t defer_now = FALSE;
2855 int ret = 0;
2856
2857 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK),
2858 p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0);
2859
2860 proc_list_lock();
2861
2862 if ((p->p_listflag & P_LIST_EXITED) != 0) {
2863 /*
2864 * Process is on its way out.
2865 */
2866 ret = EBUSY;
2867 goto exit;
2868 }
2869
2870 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
2871 ret = EPERM;
2872 goto exit;
2873 }
2874
2875 if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
2876 /* error */
2877 goto exit;
2878 }
2879
2880 old_dirty = p->p_memstat_dirty;
2881
2882 /* These bits are cumulative, as per <rdar://problem/11159924> */
2883 if (pcontrol & PROC_DIRTY_TRACK) {
2884 p->p_memstat_dirty |= P_DIRTY_TRACK;
2885 }
2886
2887 if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
2888 p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
2889 }
2890
2891 if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
2892 p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
2893 }
2894
2895 if (old_dirty & P_DIRTY_AGING_IN_PROGRESS) {
2896 already_deferred = TRUE;
2897 }
2898
2899
2900 /* This can be set and cleared exactly once. */
2901 if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
2902 if ((pcontrol & (PROC_DIRTY_DEFER)) &&
2903 !(old_dirty & P_DIRTY_DEFER)) {
2904 p->p_memstat_dirty |= P_DIRTY_DEFER;
2905 }
2906
2907 if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) &&
2908 !(old_dirty & P_DIRTY_DEFER_ALWAYS)) {
2909 p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS;
2910 }
2911
2912 defer_now = TRUE;
2913 }
2914
2915 MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for pid %d\n",
2916 ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
2917 defer_now ? "Y" : "N",
2918 p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
2919 p->p_pid);
2920
2921 /* Kick off or invalidate the idle exit deferment if there's a state transition. */
2922 if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
2923 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
2924 if (defer_now && !already_deferred) {
2925 /*
2926 * Request to defer a clean process that's idle-exit enabled
2927 * and not already in the jetsam deferred band. Most likely a
2928 * new launch.
2929 */
2930 memorystatus_schedule_idle_demotion_locked(p, TRUE);
2931 reschedule = TRUE;
2932 } else if (!defer_now) {
2933 /*
2934 * The process isn't asking for the 'aging' facility.
2935 * Could be that it is:
2936 */
2937
2938 if (already_deferred) {
2939 /*
2940 * already in the aging bands. Traditionally,
2941 * some processes have tried to use this to
2942 * opt out of the 'aging' facility.
2943 */
2944
2945 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2946 } else {
2947 /*
2948 * agnostic to the 'aging' facility. In that case,
2949 * we'll go ahead and opt it in because this is likely
2950 * a new launch (clean process, dirty tracking enabled)
2951 */
2952
2953 memorystatus_schedule_idle_demotion_locked(p, TRUE);
2954 }
2955
2956 reschedule = TRUE;
2957 }
2958 }
2959 } else {
2960 /*
2961 * We are trying to operate on a dirty process. Dirty processes have to
2962 * be removed from the deferred band. The question is: do we reset the
2963 * deferred state or not?
2964 *
2965 * This could be a legal request like:
2966 * - this process had opted into the 'aging' band
2967 * - but it's now dirty and requests to opt out.
2968 * In this case, we remove the process from the band and reset its
2969 * state too. It'll opt back in properly when needed.
2970 *
2971 * OR, this request could be a user-space bug. E.g.:
2972 * - this process had opted into the 'aging' band when clean
2973 * - and then issues another request to again put it into the band, except
2974 * this time the process is dirty.
2975 * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of
2976 * the deferred band with its state intact. So our request below is a no-op.
2977 * But we do it here anyway, for coverage.
2978 *
2979 * memorystatus_update_idle_priority_locked()
2980 * single-mindedly treats a dirty process as "cannot be in the aging band".
2981 */
2982
2983 if (!defer_now && already_deferred) {
2984 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2985 reschedule = TRUE;
2986 } else {
2987 boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
2988
2989 memorystatus_invalidate_idle_demotion_locked(p, reset_state);
2990 reschedule = TRUE;
2991 }
2992 }
2993
2994 memorystatus_update_idle_priority_locked(p);
2995
2996 if (reschedule) {
2997 memorystatus_reschedule_idle_demotion_locked();
2998 }
2999
3000 ret = 0;
3001
3002 exit:
3003 proc_list_unlock();
3004
3005 return ret;
3006 }
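/*
 * The pcontrol convention for memorystatus_dirty_set() below: a non-zero
 * value marks the process dirty, zero marks it clean. A typical sequence
 * (sketch using the flags validated above):
 *     memorystatus_dirty_track(p, PROC_DIRTY_TRACK | PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER);
 *     memorystatus_dirty_set(p, TRUE, 1);    // busy: promote to requested band
 *     memorystatus_dirty_set(p, TRUE, 0);    // idle: head back toward IDLE
 */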
3007
3008 int
3009 memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol)
3010 {
3011 int ret;
3012 boolean_t kill = false;
3013 boolean_t reschedule = FALSE;
3014 boolean_t was_dirty = FALSE;
3015 boolean_t now_dirty = FALSE;
3016 #if CONFIG_DIRTYSTATUS_TRACKING
3017 boolean_t notify_change = FALSE;
3018 dirty_status_change_event_t change_event;
3019 #endif
3020
3021 MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty);
3022 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0);
3023
3024 proc_list_lock();
3025
3026 if ((p->p_listflag & P_LIST_EXITED) != 0) {
3027 /*
3028 * Process is on its way out.
3029 */
3030 ret = EBUSY;
3031 goto exit;
3032 }
3033
3034 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3035 ret = EPERM;
3036 goto exit;
3037 }
3038
3039 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3040 was_dirty = TRUE;
3041 }
3042
3043 if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
3044 /* Dirty tracking not enabled */
3045 ret = EINVAL;
3046 } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
3047 /*
3048 * Process is set to be terminated and we're attempting to mark it dirty.
3049 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
3050 */
3051 ret = EBUSY;
3052 } else {
3053 int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
3054 if (pcontrol && !(p->p_memstat_dirty & flag)) {
3055 /* Mark the process as having been dirtied at some point */
3056 p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
3057 memorystatus_dirty_count++;
3058 ret = 0;
3059 } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
3060 if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
3061 /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
3062 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3063 kill = true;
3064 } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
3065 /* Kill previously terminated processes if set clean */
3066 kill = true;
3067 }
3068 p->p_memstat_dirty &= ~flag;
3069 memorystatus_dirty_count--;
3070 ret = 0;
3071 } else {
3072 /* Already set */
3073 ret = EALREADY;
3074 }
3075 }
3076
3077 if (ret != 0) {
3078 goto exit;
3079 }
3080
3081 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3082 now_dirty = TRUE;
3083 }
3084
3085 if ((was_dirty == TRUE && now_dirty == FALSE) ||
3086 (was_dirty == FALSE && now_dirty == TRUE)) {
3087 #if CONFIG_DIRTYSTATUS_TRACKING
3088 if (dirtystatus_tracking_enabled) {
3089 uint32_t pages = 0;
3090 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
3091 change_event.dsc_pid = p->p_pid;
3092 change_event.dsc_event_type = (now_dirty == TRUE) ? kDirtyStatusChangedDirty : kDirtyStatusChangedClean;
3093 change_event.dsc_time = mach_absolute_time();
3094 change_event.dsc_pages = pages;
3095 change_event.dsc_priority = p->p_memstat_effectivepriority;
3096 strlcpy(&change_event.dsc_process_name[0], p->p_name, sizeof(change_event.dsc_process_name));
3097 notify_change = TRUE;
3098 }
3099 #endif
3100
3101 /* Manage idle exit deferral, if applied */
3102 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
3103 /*
3104 * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
3105 * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
3106 * P_DIRTY_DEFER: one-time protection window given at launch
3107 * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode.
3108 *
3109 * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
3110 * in that band on its way to IDLE.
3111 */
3112
3113 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3114 /*
3115 * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE"
3116 *
3117 * The process will move from its aging band to its higher requested
3118 * jetsam band.
3119 */
3120 boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
3121
3122 memorystatus_invalidate_idle_demotion_locked(p, reset_state);
3123 reschedule = TRUE;
3124 } else {
3125 /*
3126 * Process is back from "dirty" to "clean".
3127 */
3128
3129 if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) {
3130 if (((p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) == FALSE) &&
3131 (mach_absolute_time() >= p->p_memstat_idledeadline)) {
3132 /*
3133 * The process hasn't enrolled in the "always defer after dirty"
3134 * mode and its deadline has expired. It currently
3135 * does not reside in any of the aging buckets.
3136 *
3137 * It's on its way to the JETSAM_PRIORITY_IDLE
3138 * bucket via memorystatus_update_idle_priority_locked()
3139 * below.
3140 *
3141 * So all we need to do is reset all the state on the
3142 * process that's related to the aging bucket i.e.
3143 * the AGING_IN_PROGRESS flag and the timer deadline.
3144 */
3145
3146 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
3147 reschedule = TRUE;
3148 } else {
3149 /*
3150 * Process enrolled in "always stop in deferral band after dirty" OR
3151 * it still has some protection window left, so
3152 * we just re-arm the timer without modifying any
3153 * state on the process, provided it still wants to be in that band.
3154 */
3155
3156 if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
3157 memorystatus_schedule_idle_demotion_locked(p, TRUE);
3158 reschedule = TRUE;
3159 } else if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) {
3160 memorystatus_schedule_idle_demotion_locked(p, FALSE);
3161 reschedule = TRUE;
3162 }
3163 }
3164 } else {
3165 memorystatus_schedule_idle_demotion_locked(p, TRUE);
3166 reschedule = TRUE;
3167 }
3168 }
3169 }
3170
3171 memorystatus_update_idle_priority_locked(p);
3172
3173 if (memorystatus_highwater_enabled) {
3174 boolean_t ledger_update_needed = TRUE;
3175 boolean_t use_active;
3176 boolean_t is_fatal;
3177 /*
3178 * We are in this path because this process transitioned between
3179 * dirty <--> clean state. Update the cached memory limits.
3180 */
3181
3182 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
3183 /*
3184 * process is pinned in elevated band
3185 * or
3186 * process is dirty
3187 */
3188 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
3189 use_active = TRUE;
3190 ledger_update_needed = TRUE;
3191 } else {
3192 /*
3193 * process is clean...but if it has opted into pressured-exit
3194 * we don't apply the INACTIVE limit till the process has aged
3195 * out and is entering the IDLE band.
3196 * See memorystatus_update_priority_locked() for that.
3197 */
3198
3199 if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
3200 ledger_update_needed = FALSE;
3201 } else {
3202 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
3203 use_active = FALSE;
3204 ledger_update_needed = TRUE;
3205 }
3206 }
3207
3208 /*
3209 * Enforce the new limits by writing to the ledger.
3210 *
3211 * This is a hot path, and holding the proc_list_lock while writing to the ledgers
3212 * (where the task lock is taken) is bad. So, we temporarily drop the proc_list_lock.
3213 * We aren't traversing the jetsam bucket list here, so we should be safe.
3214 * See rdar://21394491.
3215 */
3216
3217 if (ledger_update_needed && proc_ref_locked(p) == p) {
3218 int ledger_limit;
3219 if (p->p_memstat_memlimit > 0) {
3220 ledger_limit = p->p_memstat_memlimit;
3221 } else {
3222 ledger_limit = -1;
3223 }
3224 proc_list_unlock();
3225 task_set_phys_footprint_limit_internal(p->task, ledger_limit, NULL, use_active, is_fatal);
3226 proc_list_lock();
3227 proc_rele_locked(p);
3228
3229 MEMORYSTATUS_DEBUG(3, "memorystatus_dirty_set: new limit on pid %d (%dMB %s) priority(%d) dirty?=0x%x %s\n",
3230 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
3231 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
3232 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
3233 }
3234 }
3235
3236 /* If the deferral state changed, reschedule the demotion timer */
3237 if (reschedule) {
3238 memorystatus_reschedule_idle_demotion_locked();
3239 }
3240 }
3241
3242 if (kill) {
3243 if (proc_ref_locked(p) == p) {
3244 proc_list_unlock();
3245 psignal(p, SIGKILL);
3246 proc_list_lock();
3247 proc_rele_locked(p);
3248 }
3249 }
3250
3251 exit:
3252 proc_list_unlock();
3253
3254 #if CONFIG_DIRTYSTATUS_TRACKING
3255 // Before returning, let's notify the dirtiness status if we have to
3256 if (notify_change) {
3257 memorystatus_send_dirty_status_change_note(&change_event, sizeof(change_event));
3258 }
3259 #endif
3260
3261 return ret;
3262 }
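
/*
 * Illustrative sketch (not part of this translation unit): how a daemon
 * might drive the dirty-tracking state machine handled above from user
 * space. The libproc wrapper names/signatures shown are assumptions for
 * illustration only; the kernel-side transitions above are authoritative.
 *
 *	// Opt in once at launch: track dirtiness, allow idle-exit, and
 *	// request the one-shot launch protection window.
 *	proc_track_dirty(getpid(), PROC_DIRTY_TRACK |
 *	    PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER);
 *
 *	proc_set_dirty(getpid(), true);   // going dirty: leaves the aging band
 *	// ... do work ...
 *	proc_set_dirty(getpid(), false);  // going clean: re-arms idle demotion
 */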
3263
3264 int
3265 memorystatus_dirty_clear(proc_t p, uint32_t pcontrol)
3266 {
3267 int ret = 0;
3268
3269 MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty);
3270
3271 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0);
3272
3273 proc_list_lock();
3274
3275 if ((p->p_listflag & P_LIST_EXITED) != 0) {
3276 /*
3277 * Process is on its way out.
3278 */
3279 ret = EBUSY;
3280 goto exit;
3281 }
3282
3283 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3284 ret = EPERM;
3285 goto exit;
3286 }
3287
3288 if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
3289 /* Dirty tracking not enabled */
3290 ret = EINVAL;
3291 goto exit;
3292 }
3293
3294 if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) {
3295 ret = EINVAL;
3296 goto exit;
3297 }
3298
3299 if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
3300 p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
3301 }
3302
3303 /* This can be set and cleared exactly once. */
3304 if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
3305 if (p->p_memstat_dirty & P_DIRTY_DEFER) {
3306 p->p_memstat_dirty &= ~(P_DIRTY_DEFER);
3307 }
3308
3309 if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
3310 p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS);
3311 }
3312
3313 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
3314 memorystatus_update_idle_priority_locked(p);
3315 memorystatus_reschedule_idle_demotion_locked();
3316 }
3317
3318 ret = 0;
3319 exit:
3320 proc_list_unlock();
3321
3322 return ret;
3323 }
3324
3325 int
3326 memorystatus_dirty_get(proc_t p, boolean_t locked)
3327 {
3328 int ret = 0;
3329
3330 if (!locked) {
3331 proc_list_lock();
3332 }
3333
3334 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
3335 ret |= PROC_DIRTY_TRACKED;
3336 if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
3337 ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
3338 }
3339 if (p->p_memstat_dirty & P_DIRTY) {
3340 ret |= PROC_DIRTY_IS_DIRTY;
3341 }
3342 if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
3343 ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
3344 }
3345 }
3346
3347 if (!locked) {
3348 proc_list_unlock();
3349 }
3350
3351 return ret;
3352 }
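
/*
 * A minimal decoding sketch for the mask returned above; the result bits
 * are exactly the PROC_DIRTY_* values set in this function. The caller
 * shown here is hypothetical and assumes it already holds a proc ref.
 *
 *	int mask = memorystatus_dirty_get(p, FALSE);
 *	bool tracked   = !!(mask & PROC_DIRTY_TRACKED);
 *	bool idle_exit = !!(mask & PROC_DIRTY_ALLOWS_IDLE_EXIT);
 *	bool is_dirty  = !!(mask & PROC_DIRTY_IS_DIRTY);
 *	bool launching = !!(mask & PROC_DIRTY_LAUNCH_IS_IN_PROGRESS);
 */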
3353
3354 int
3355 memorystatus_on_terminate(proc_t p)
3356 {
3357 int sig;
3358
3359 proc_list_lock();
3360
3361 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3362
3363 if (((p->p_memstat_dirty & (P_DIRTY_TRACK | P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) ||
3364 (p->p_memstat_state & P_MEMSTAT_SUSPENDED)) {
3365 /*
3366 * Mark as terminated and issue SIGKILL if:
3367 * - process is clean, or,
3368 * - if process is dirty but suspended. This case is likely
3369 * an extension because apps don't opt into dirty-tracking
3370 * and daemons aren't suspended.
3371 */
3372 #if DEVELOPMENT || DEBUG
3373 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
3374 os_log(OS_LOG_DEFAULT, "memorystatus: sending suspended process %s (pid %d) SIGKILL",
3375 (*p->p_name ? p->p_name : "unknown"), p->p_pid);
3376 }
3377 #endif /* DEVELOPMENT || DEBUG */
3378 sig = SIGKILL;
3379 } else {
3380 /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
3381 sig = SIGTERM;
3382 }
3383
3384 proc_list_unlock();
3385
3386 return sig;
3387 }
3388
3389 void
3390 memorystatus_on_suspend(proc_t p)
3391 {
3392 #if CONFIG_FREEZE
3393 uint32_t pages;
3394 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
3395 #endif
3396 proc_list_lock();
3397 #if CONFIG_FREEZE
3398 memorystatus_suspended_count++;
3399 #endif
3400 p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
3401 proc_list_unlock();
3402 }
3403
3404 extern uint64_t memorystatus_thaw_count_since_boot;
3405
3406 void
3407 memorystatus_on_resume(proc_t p)
3408 {
3409 #if CONFIG_FREEZE
3410 boolean_t frozen;
3411 pid_t pid;
3412 #endif
3413
3414 proc_list_lock();
3415
3416 #if CONFIG_FREEZE
3417 frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
3418 if (frozen) {
3419 /*
3420 * Now that we don't _thaw_ a process completely,
3421 * resuming it (and having some on-demand swapins)
3422 * shouldn't preclude it from being counted as frozen.
3423 *
3424 * memorystatus_frozen_count--;
3425 *
3426 * We preserve the P_MEMSTAT_FROZEN state since the process
3427 * could have state on disk AND so will deserve some protection
3428 * in the jetsam bands.
3429 */
3430 if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) {
3431 p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE;
3432 memorystatus_refreeze_eligible_count++;
3433 }
3434 p->p_memstat_thaw_count++;
3435
3436 memorystatus_thaw_count++;
3437 memorystatus_thaw_count_since_boot++;
3438 }
3439
3440 memorystatus_suspended_count--;
3441
3442 pid = p->p_pid;
3443 #endif
3444
3445 /*
3446 * P_MEMSTAT_FROZEN will remain unchanged. This used to be:
3447 * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
3448 */
3449 p->p_memstat_state &= ~P_MEMSTAT_SUSPENDED;
3450
3451 proc_list_unlock();
3452
3453 #if CONFIG_FREEZE
3454 if (frozen) {
3455 memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
3456 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
3457 }
3458 #endif
3459 }
3460
3461 void
3462 memorystatus_on_inactivity(proc_t p)
3463 {
3464 #pragma unused(p)
3465 #if CONFIG_FREEZE
3466 /* Wake the freeze thread */
3467 thread_wakeup((event_t)&memorystatus_freeze_wakeup);
3468 #endif
3469 }
3470
3471 /*
3472 * The proc_list_lock is held by the caller.
3473 */
3474 static uint32_t
3475 memorystatus_build_state(proc_t p)
3476 {
3477 uint32_t snapshot_state = 0;
3478
3479 /* General */
3480 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
3481 snapshot_state |= kMemorystatusSuspended;
3482 }
3483 if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
3484 snapshot_state |= kMemorystatusFrozen;
3485 }
3486 if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
3487 snapshot_state |= kMemorystatusWasThawed;
3488 }
3489 if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
3490 snapshot_state |= kMemorystatusAssertion;
3491 }
3492
3493 /* Tracking */
3494 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
3495 snapshot_state |= kMemorystatusTracked;
3496 }
3497 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
3498 snapshot_state |= kMemorystatusSupportsIdleExit;
3499 }
3500 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3501 snapshot_state |= kMemorystatusDirty;
3502 }
3503
3504 return snapshot_state;
3505 }
3506
3507 static boolean_t
3508 kill_idle_exit_proc(void)
3509 {
3510 proc_t p, victim_p = PROC_NULL;
3511 uint64_t current_time, footprint_of_killed_proc;
3512 boolean_t killed = FALSE;
3513 unsigned int i = 0;
3514 os_reason_t jetsam_reason = OS_REASON_NULL;
3515
3516 /* Pick next idle exit victim. */
3517 current_time = mach_absolute_time();
3518
3519 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT);
3520 if (jetsam_reason == OS_REASON_NULL) {
3521 printf("kill_idle_exit_proc: failed to allocate jetsam reason\n");
3522 }
3523
3524 proc_list_lock();
3525
3526 p = memorystatus_get_first_proc_locked(&i, FALSE);
3527 while (p) {
3528 /* No need to look beyond the idle band */
3529 if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
3530 break;
3531 }
3532
3533 if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
3534 if (current_time >= p->p_memstat_idledeadline) {
3535 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3536 victim_p = proc_ref_locked(p);
3537 break;
3538 }
3539 }
3540
3541 p = memorystatus_get_next_proc_locked(&i, p, FALSE);
3542 }
3543
3544 proc_list_unlock();
3545
3546 if (victim_p) {
3547 printf("memorystatus: killing_idle_process pid %d [%s] jetsam_reason->osr_code: %llu\n", victim_p->p_pid, (*victim_p->p_name ? victim_p->p_name : "unknown"), jetsam_reason->osr_code);
3548 killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason, &footprint_of_killed_proc);
3549 proc_rele(victim_p);
3550 } else {
3551 os_reason_free(jetsam_reason);
3552 }
3553
3554 return killed;
3555 }
3556
3557 static void
3558 memorystatus_thread_wake(void)
3559 {
3560 int thr_id = 0;
3561 int active_thr = atomic_load(&active_jetsam_threads);
3562
3563 /* Wakeup all the jetsam threads */
3564 for (thr_id = 0; thr_id < active_thr; thr_id++) {
3565 thread_wakeup((event_t)&jetsam_threads[thr_id].memorystatus_wakeup);
3566 }
3567 }
3568
3569 #if CONFIG_JETSAM
3570
3571 static void
3572 memorystatus_thread_pool_max()
3573 {
3574 /* Increase the jetsam thread pool to max_jetsam_threads */
3575 int max_threads = max_jetsam_threads;
3576 printf("Expanding memorystatus pool to %d!\n", max_threads);
3577 atomic_store(&active_jetsam_threads, max_threads);
3578 }
3579
3580 static void
3581 memorystatus_thread_pool_default()
3582 {
3583 /* Restore the jetsam thread pool to a single thread */
3584 printf("Reverting memorystatus pool back to 1\n");
3585 atomic_store(&active_jetsam_threads, 1);
3586 }
3587
3588 #endif /* CONFIG_JETSAM */
3589
3590 extern void vm_pressure_response(void);
3591
3592 static int
3593 memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
3594 {
3595 struct jetsam_thread_state *jetsam_thread = jetsam_current_thread();
3596
3597 assert(jetsam_thread != NULL);
3598 if (interval_ms) {
3599 assert_wait_timeout(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT, interval_ms, NSEC_PER_MSEC);
3600 } else {
3601 assert_wait(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT);
3602 }
3603
3604 return thread_block(continuation);
3605 }
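
/*
 * Usage note: callers follow the classic assert_wait()/thread_block()
 * continuation pattern, so the thread's stack is discarded while blocked
 * and execution restarts at the top of the continuation on wakeup. For
 * example, memorystatus_thread() ends with:
 *
 *	memorystatus_thread_block(0, memorystatus_thread);
 *
 * and is re-entered from the top on the next memorystatus_thread_wake().
 */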
3606
3607 static boolean_t
3608 memorystatus_avail_pages_below_pressure(void)
3609 {
3610 #if CONFIG_JETSAM
3611 return memorystatus_available_pages <= memorystatus_available_pages_pressure;
3612 #else /* CONFIG_JETSAM */
3613 return FALSE;
3614 #endif /* CONFIG_JETSAM */
3615 }
3616
3617 static boolean_t
3618 memorystatus_avail_pages_below_critical(void)
3619 {
3620 #if CONFIG_JETSAM
3621 return memorystatus_available_pages <= memorystatus_available_pages_critical;
3622 #else /* CONFIG_JETSAM */
3623 return FALSE;
3624 #endif /* CONFIG_JETSAM */
3625 }
3626
3627 static boolean_t
3628 memorystatus_post_snapshot(int32_t priority, uint32_t cause)
3629 {
3630 boolean_t is_idle_priority;
3631
3632 if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) {
3633 is_idle_priority = (priority == JETSAM_PRIORITY_IDLE);
3634 } else {
3635 is_idle_priority = (priority == JETSAM_PRIORITY_IDLE || priority == JETSAM_PRIORITY_IDLE_DEFERRED);
3636 }
3637 #if CONFIG_JETSAM
3638 #pragma unused(cause)
3639 /*
3640 * Don't generate logs for steady-state idle-exit kills,
3641 * unless it is overridden for debug or by the device
3642 * tree.
3643 */
3644
3645 return !is_idle_priority || memorystatus_idle_snapshot;
3646
3647 #else /* CONFIG_JETSAM */
3648 /*
3649 * Don't generate logs for steady-state idle-exit kills,
3650 * unless
3651 * - it is overridden for debug or by the device
3652 * tree.
3653 * OR
3654 * - the kill causes are important i.e. not kMemorystatusKilledIdleExit
3655 */
3656
3657 boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause));
3658 return !is_idle_priority || memorystatus_idle_snapshot || snapshot_eligible_kill_cause;
3659 #endif /* CONFIG_JETSAM */
3660 }
3661
3662 static boolean_t
3663 memorystatus_action_needed(void)
3664 {
3665 #if CONFIG_JETSAM
3666 return is_reason_thrashing(kill_under_pressure_cause) ||
3667 is_reason_zone_map_exhaustion(kill_under_pressure_cause) ||
3668 memorystatus_available_pages <= memorystatus_available_pages_pressure;
3669 #else /* CONFIG_JETSAM */
3670 return is_reason_thrashing(kill_under_pressure_cause) ||
3671 is_reason_zone_map_exhaustion(kill_under_pressure_cause);
3672 #endif /* CONFIG_JETSAM */
3673 }
3674
3675 static boolean_t
3676 memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, boolean_t *post_snapshot, __unused boolean_t *is_critical, uint64_t *memory_reclaimed)
3677 {
3678 boolean_t purged = FALSE, killed = FALSE;
3679
3680 *memory_reclaimed = 0;
3681 killed = memorystatus_kill_hiwat_proc(errors, &purged, memory_reclaimed);
3682
3683 if (killed) {
3684 *hwm_kill = *hwm_kill + 1;
3685 *post_snapshot = TRUE;
3686 return TRUE;
3687 } else {
3688 if (purged == FALSE) {
3689 /* couldn't purge and couldn't kill */
3690 memorystatus_hwm_candidates = FALSE;
3691 }
3692 }
3693
3694 #if CONFIG_JETSAM
3695 /* No highwater processes to kill. Continue or stop for now? */
3696 if (!is_reason_thrashing(kill_under_pressure_cause) &&
3697 !is_reason_zone_map_exhaustion(kill_under_pressure_cause) &&
3698 (memorystatus_available_pages > memorystatus_available_pages_critical)) {
3699 /*
3700 * We are _not_ out of pressure but we are above the critical threshold and there's:
3701 * - no compressor thrashing
3702 * - enough zone memory
3703 * - no more HWM processes left.
3704 * For now, don't kill any other processes.
3705 */
3706
3707 if (*hwm_kill == 0) {
3708 memorystatus_thread_wasted_wakeup++;
3709 }
3710
3711 *is_critical = FALSE;
3712
3713 return TRUE;
3714 }
3715 #endif /* CONFIG_JETSAM */
3716
3717 return FALSE;
3718 }
3719
3720 /*
3721 * kJetsamHighRelaunchCandidatesThreshold defines the percentage of candidates
3722 * in the idle & deferred bands that need to be bad candidates in order to trigger
3723 * aggressive jetsam.
3724 */
3725 #define kJetsamHighRelaunchCandidatesThreshold (100)
3726
3727 /* kJetsamMinCandidatesThreshold defines the minimum number of candidates in the
3728 * idle/deferred bands to trigger aggressive jetsam. This value basically decides
3729 * how much memory the system is ready to hold in the lower bands without triggering
3730 * aggressive jetsam. This number should ideally be tuned based on the memory config
3731 * of the device.
3732 */
3733 #define kJetsamMinCandidatesThreshold (5)
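
/*
 * Worked example with hypothetical counts: 8 candidates across the idle and
 * deferred bands, all 8 marked high-relaunch-probability, gives
 * (8 * 100) / 8 = 100 >= kJetsamHighRelaunchCandidatesThreshold, so the
 * bad-candidate check passes; and since 8 >= kJetsamMinCandidatesThreshold,
 * the low-candidate-count check does not veto aggressive jetsam either.
 */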
3734
3735 static boolean_t
3736 memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int jld_eval_aggressive_count, __unused int *jld_idle_kills, __unused int jld_idle_kill_candidates, int *total_candidates, int *elevated_bucket_count)
3737 {
3738 boolean_t aggressive_jetsam_needed = false;
3739
3740 /*
3741 * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, we maintain the jetsam
3742 * relaunch behavior for all daemons. Also, daemons and apps are aged in deferred bands on
3743 * every dirty->clean transition. For this aging policy, the best way to determine if
3744 * aggressive jetsam is needed, is to see if the kill candidates are mostly bad candidates.
3745 * If yes, then we need to go to higher bands to reclaim memory.
3746 */
3747 proc_list_lock();
3748 /* Get total candidate counts for idle and idle deferred bands */
3749 *total_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].count + memstat_bucket[system_procs_aging_band].count;
3750 /* Get counts of bad kill candidates in idle and idle deferred bands */
3751 int bad_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].relaunch_high_count + memstat_bucket[system_procs_aging_band].relaunch_high_count;
3752
3753 *elevated_bucket_count = memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE].count;
3754
3755 proc_list_unlock();
3756
3757 /* Check if the share of bad candidates is at least kJetsamHighRelaunchCandidatesThreshold %; guard against division by zero when the bands are empty */
3758 aggressive_jetsam_needed = ((*total_candidates > 0) && (((bad_candidates * 100) / *total_candidates) >= kJetsamHighRelaunchCandidatesThreshold));
3759
3760 /*
3761 * Since the new aging policy bases the aggressive jetsam trigger on percentage of
3762 * bad candidates, it is prone to being overly aggressive. In order to mitigate that,
3763 * make sure the system is really under memory pressure before triggering aggressive
3764 * jetsam.
3765 */
3766 if (memorystatus_available_pages > memorystatus_sysproc_aging_aggr_pages) {
3767 aggressive_jetsam_needed = false;
3768 }
3769
3770 #if DEVELOPMENT || DEBUG
3771 printf("memorystatus: aggressive%d: [%s] Bad Candidate Threshold Check (total: %d, bad: %d, threshold: %d %%); Memory Pressure Check (available_pgs: %llu, threshold_pgs: %llu)\n",
3772 jld_eval_aggressive_count, aggressive_jetsam_needed ? "PASSED" : "FAILED", *total_candidates, bad_candidates,
3773 kJetsamHighRelaunchCandidatesThreshold, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, (uint64_t)memorystatus_sysproc_aging_aggr_pages);
3774 #endif /* DEVELOPMENT || DEBUG */
3775 return aggressive_jetsam_needed;
3776 }
3777
3778 /*
3779 * Gets memory back from various system caches.
3780 * Called before jetsamming in the foreground band in the hope that we'll
3781 * avoid a jetsam.
3782 */
3783 static void
3784 memorystatus_approaching_fg_band(boolean_t *corpse_list_purged)
3785 {
3786 assert(corpse_list_purged != NULL);
3787 pmap_release_pages_fast();
3788 memorystatus_issue_fg_band_notify();
3789 if (total_corpses_count() > 0 && !*corpse_list_purged) {
3790 task_purge_all_corpses();
3791 *corpse_list_purged = TRUE;
3792 }
3793 }
3794
3795 static boolean_t
3796 memorystatus_aggressive_jetsam_needed_default(__unused int jld_eval_aggressive_count, int *jld_idle_kills, int jld_idle_kill_candidates, int *total_candidates, int *elevated_bucket_count)
3797 {
3798 boolean_t aggressive_jetsam_needed = false;
3799 /* Jetsam Loop Detection - locals */
3800 memstat_bucket_t *bucket;
3801 int jld_bucket_count = 0;
3802
3803 proc_list_lock();
3804 switch (jetsam_aging_policy) {
3805 case kJetsamAgingPolicyLegacy:
3806 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3807 jld_bucket_count = bucket->count;
3808 bucket = &memstat_bucket[JETSAM_PRIORITY_AGING_BAND1];
3809 jld_bucket_count += bucket->count;
3810 break;
3811 case kJetsamAgingPolicyAppsReclaimedFirst:
3812 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3813 jld_bucket_count = bucket->count;
3814 bucket = &memstat_bucket[system_procs_aging_band];
3815 jld_bucket_count += bucket->count;
3816 bucket = &memstat_bucket[applications_aging_band];
3817 jld_bucket_count += bucket->count;
3818 break;
3819 case kJetsamAgingPolicyNone:
3820 default:
3821 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3822 jld_bucket_count = bucket->count;
3823 break;
3824 }
3825
3826 bucket = &memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE];
3827 *elevated_bucket_count = bucket->count;
3828 *total_candidates = jld_bucket_count;
3829 proc_list_unlock();
3830
3831 aggressive_jetsam_needed = (*jld_idle_kills > jld_idle_kill_candidates);
3832
3833 #if DEVELOPMENT || DEBUG
3834 if (aggressive_jetsam_needed) {
3835 printf("memorystatus: aggressive%d: idle candidates: %d, idle kills: %d\n",
3836 jld_eval_aggressive_count,
3837 jld_idle_kill_candidates,
3838 *jld_idle_kills);
3839 }
3840 #endif /* DEVELOPMENT || DEBUG */
3841 return aggressive_jetsam_needed;
3842 }
3843
3844 static boolean_t
3845 memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_idle_kills, boolean_t *corpse_list_purged, boolean_t *post_snapshot, uint64_t *memory_reclaimed)
3846 {
3847 boolean_t aggressive_jetsam_needed = false;
3848 boolean_t killed;
3849 uint32_t errors = 0;
3850 uint64_t footprint_of_killed_proc = 0;
3851 int elevated_bucket_count = 0;
3852 int total_candidates = 0;
3853 *memory_reclaimed = 0;
3854
3855 /*
3856 * The aggressive jetsam logic looks at the number of times it has been in the
3857 * aggressive loop to determine the max priority band it should kill up to. The
3858 * static variables below are used to track that property.
3859 *
3860 * To reset those values, the implementation checks if it has been
3861 * memorystatus_jld_eval_period_msecs since the parameters were reset.
3862 */
3863 static int jld_eval_aggressive_count = 0;
3864 static int32_t jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
3865 static uint64_t jld_timestamp_msecs = 0;
3866 static int jld_idle_kill_candidates = 0;
3867
3868 if (memorystatus_jld_enabled == FALSE) {
3869 /* If aggressive jetsam is disabled, nothing to do here */
3870 return FALSE;
3871 }
3872
3873 /* Get current timestamp (second granularity, expressed in msecs) */
3874 struct timeval jld_now_tstamp = {0, 0};
3875 uint64_t jld_now_msecs = 0;
3876 microuptime(&jld_now_tstamp);
3877 jld_now_msecs = (jld_now_tstamp.tv_sec * 1000);
3878
3879 /*
3880 * The aggressive jetsam logic looks at the number of candidates and their
3881 * properties to decide if aggressive jetsam should be engaged.
3882 */
3883 if (jetsam_aging_policy == kJetsamAgingPolicySysProcsReclaimedFirst) {
3884 /*
3885 * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, the logic looks at the number of
3886 * candidates in the idle and deferred band and how many out of them are marked as high relaunch
3887 * probability.
3888 */
3889 aggressive_jetsam_needed = memorystatus_aggressive_jetsam_needed_sysproc_aging(jld_eval_aggressive_count,
3890 jld_idle_kills, jld_idle_kill_candidates, &total_candidates, &elevated_bucket_count);
3891 } else {
3892 /*
3893 * The other aging policies look at number of candidate processes over a specific time window and
3894 * evaluate if the system is in a jetsam loop. If yes, aggressive jetsam is triggered.
3895 */
3896 aggressive_jetsam_needed = memorystatus_aggressive_jetsam_needed_default(jld_eval_aggressive_count,
3897 jld_idle_kills, jld_idle_kill_candidates, &total_candidates, &elevated_bucket_count);
3898 }
3899
3900 /*
3901 * Check if it's been a long time since the aggressive jetsam evaluation
3902 * parameters have been refreshed. This logic also resets the jld_eval_aggressive_count
3903 * counter to make sure we reset the aggressive jetsam severity.
3904 */
3905 boolean_t param_reval = false;
3906
3907 if ((total_candidates == 0) ||
3908 (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) {
3909 jld_timestamp_msecs = jld_now_msecs;
3910 jld_idle_kill_candidates = total_candidates;
3911 *jld_idle_kills = 0;
3912 jld_eval_aggressive_count = 0;
3913 jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
3914 param_reval = true;
3915 }
3916
3917 /*
3918 * If the parameters have been updated, re-evaluate the aggressive_jetsam_needed condition for
3919 * the non-kJetsamAgingPolicySysProcsReclaimedFirst policy, since it's based on jld_idle_kill_candidates etc.
3920 */
3921 if ((param_reval == true) && (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst)) {
3922 aggressive_jetsam_needed = (*jld_idle_kills > jld_idle_kill_candidates);
3923 }
3924
3925 /*
3926 * It is also possible that the system is down to a very small number of processes in the candidate
3927 * bands. In that case, the decisions made by the memorystatus_aggressive_jetsam_needed_* routines
3928 * would not be useful, so do not trigger aggressive jetsam.
3929 */
3930 if (total_candidates < kJetsamMinCandidatesThreshold) {
3931 #if DEVELOPMENT || DEBUG
3932 printf("memorystatus: aggressive: [FAILED] Low Candidate Count (current: %d, threshold: %d)\n", total_candidates, kJetsamMinCandidatesThreshold);
3933 #endif /* DEVELOPMENT || DEBUG */
3934 aggressive_jetsam_needed = false;
3935 }
3936
3937 if (aggressive_jetsam_needed == false) {
3938 /* Either the aging policy or the candidate count decided that aggressive jetsam is not needed. Nothing more to do here. */
3939 return FALSE;
3940 }
3941
3942 /* Looks like aggressive jetsam is needed */
3943 jld_eval_aggressive_count++;
3944
3945 if (jld_eval_aggressive_count == memorystatus_jld_eval_aggressive_count) {
3946 memorystatus_approaching_fg_band(corpse_list_purged);
3947 } else if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) {
3948 /*
3949 * Bump up the jetsam priority limit (e.g. the bucket index).
3950 * Enforce bucket index sanity.
3951 */
3952 if ((memorystatus_jld_eval_aggressive_priority_band_max < 0) ||
3953 (memorystatus_jld_eval_aggressive_priority_band_max >= MEMSTAT_BUCKET_COUNT)) {
3954 /*
3955 * Do nothing. Stick with the default level.
3956 */
3957 } else {
3958 jld_priority_band_max = memorystatus_jld_eval_aggressive_priority_band_max;
3959 }
3960 }
3961
3962 /* Visit elevated processes first */
3963 while (elevated_bucket_count) {
3964 elevated_bucket_count--;
3965
3966 /*
3967 * memorystatus_kill_elevated_process() drops a reference,
3968 * so take another one so we can continue to use this exit reason
3969 * even after it returns.
3970 */
3971
3972 os_reason_ref(jetsam_reason);
3973 killed = memorystatus_kill_elevated_process(
3974 cause,
3975 jetsam_reason,
3976 JETSAM_PRIORITY_ELEVATED_INACTIVE,
3977 jld_eval_aggressive_count,
3978 &errors, &footprint_of_killed_proc);
3979 if (killed) {
3980 *post_snapshot = TRUE;
3981 *memory_reclaimed += footprint_of_killed_proc;
3982 if (memorystatus_avail_pages_below_pressure()) {
3983 /*
3984 * Still under pressure.
3985 * Find another pinned process.
3986 */
3987 continue;
3988 } else {
3989 return TRUE;
3990 }
3991 } else {
3992 /*
3993 * No pinned processes left to kill.
3994 * Abandon elevated band.
3995 */
3996 break;
3997 }
3998 }
3999
4000 /*
4001 * memorystatus_kill_processes_aggressive() allocates its own
4002 * jetsam_reason so the kMemorystatusKilledProcThrashing cause
4003 * is consistent throughout the aggressive march.
4004 */
4005 killed = memorystatus_kill_processes_aggressive(
4006 kMemorystatusKilledProcThrashing,
4007 jld_eval_aggressive_count,
4008 jld_priority_band_max,
4009 &errors, &footprint_of_killed_proc);
4010
4011 if (killed) {
4012 /* Always generate logs after aggressive kill */
4013 *post_snapshot = TRUE;
4014 *memory_reclaimed += footprint_of_killed_proc;
4015 *jld_idle_kills = 0;
4016 return TRUE;
4017 }
4018
4019 return FALSE;
4020 }
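
/*
 * Worked example of the evaluation window above, with hypothetical values:
 * if memorystatus_jld_eval_period_msecs is 6000 and the parameters were
 * last refreshed at jld_timestamp_msecs = 120000, then a pass that observes
 * jld_now_msecs = 127000 satisfies 127000 > 120000 + 6000, so the jld_*
 * counters are reset and the aggressive-jetsam severity starts over.
 */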
4021
4022
4023 static void
4024 memorystatus_thread(void *param __unused, wait_result_t wr __unused)
4025 {
4026 boolean_t post_snapshot = FALSE;
4027 uint32_t errors = 0;
4028 uint32_t hwm_kill = 0;
4029 boolean_t sort_flag = TRUE;
4030 boolean_t corpse_list_purged = FALSE;
4031 int jld_idle_kills = 0;
4032 struct jetsam_thread_state *jetsam_thread = jetsam_current_thread();
4033 uint64_t total_memory_reclaimed = 0;
4034
4035 assert(jetsam_thread != NULL);
4036 if (jetsam_thread->inited == FALSE) {
4037 /*
4038 * It's the first time the thread has run, so just mark the thread as privileged and block.
4039 * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
4040 */
4041
4042 char name[32];
4043 thread_wire(host_priv_self(), current_thread(), TRUE);
4044 snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1);
4045
4046 /* Limit all but one thread to the lower jetsam bands, as that's where most of the victims are. */
4047 if (jetsam_thread->index == 0) {
4048 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4049 thread_vm_bind_group_add();
4050 }
4051 jetsam_thread->limit_to_low_bands = FALSE;
4052 } else {
4053 jetsam_thread->limit_to_low_bands = TRUE;
4054 }
4055 #if CONFIG_THREAD_GROUPS
4056 thread_group_vm_add();
4057 #endif
4058 thread_set_thread_name(current_thread(), name);
4059 jetsam_thread->inited = TRUE;
4060 memorystatus_thread_block(0, memorystatus_thread);
4061 }
4062
4063 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
4064 MEMORYSTATUS_LOG_AVAILABLE_PAGES, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, memorystatus_jld_eval_aggressive_count, 0);
4065
4066 /*
4067 * Jetsam aware version.
4068 *
4069 * The VM pressure notification thread is working its way through clients in parallel.
4070 *
4071 * So, while the pressure notification thread is targeting processes in order of
4072 * increasing jetsam priority, we can hopefully reduce or stop its work by killing
4073 * any processes that have exceeded their highwater mark.
4074 *
4075 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
4076 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
4077 */
4078 while (memorystatus_action_needed()) {
4079 boolean_t killed;
4080 int32_t priority;
4081 uint32_t cause;
4082 uint64_t memory_reclaimed = 0;
4083 uint64_t jetsam_reason_code = JETSAM_REASON_INVALID;
4084 os_reason_t jetsam_reason = OS_REASON_NULL;
4085
4086 cause = kill_under_pressure_cause;
4087 switch (cause) {
4088 case kMemorystatusKilledFCThrashing:
4089 jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING;
4090 break;
4091 case kMemorystatusKilledVMCompressorThrashing:
4092 jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING;
4093 break;
4094 case kMemorystatusKilledVMCompressorSpaceShortage:
4095 jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE;
4096 break;
4097 case kMemorystatusKilledZoneMapExhaustion:
4098 jetsam_reason_code = JETSAM_REASON_ZONE_MAP_EXHAUSTION;
4099 break;
4100 case kMemorystatusKilledVMPageShortage:
4101 /* falls through */
4102 default:
4103 jetsam_reason_code = JETSAM_REASON_MEMORY_VMPAGESHORTAGE;
4104 cause = kMemorystatusKilledVMPageShortage;
4105 break;
4106 }
4107
4108 /* Highwater */
4109 boolean_t is_critical = TRUE;
4110 if (memorystatus_act_on_hiwat_processes(&errors, &hwm_kill, &post_snapshot, &is_critical, &memory_reclaimed)) {
4111 total_memory_reclaimed += memory_reclaimed;
4112 if (is_critical == FALSE) {
4113 /*
4114 * For now, don't kill any other processes.
4115 */
4116 break;
4117 } else {
4118 goto done;
4119 }
4120 }
4121
4122 jetsam_reason = os_reason_create(OS_REASON_JETSAM, jetsam_reason_code);
4123 if (jetsam_reason == OS_REASON_NULL) {
4124 printf("memorystatus_thread: failed to allocate jetsam reason\n");
4125 }
4126
4127 /* Only unlimited jetsam threads should act aggressive */
4128 if (!jetsam_thread->limit_to_low_bands &&
4129 memorystatus_act_aggressive(cause, jetsam_reason, &jld_idle_kills, &corpse_list_purged, &post_snapshot, &memory_reclaimed)) {
4130 total_memory_reclaimed += memory_reclaimed;
4131 goto done;
4132 }
4133
4134 /*
4135 * memorystatus_kill_top_process() drops a reference,
4136 * so take another one so we can continue to use this exit reason
4137 * even after it returns
4138 */
4139 os_reason_ref(jetsam_reason);
4140
4141 /* LRU */
4142 killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, jetsam_reason, &priority, &errors, &memory_reclaimed);
4143 sort_flag = FALSE;
4144
4145 if (killed) {
4146 total_memory_reclaimed += memory_reclaimed;
4147 if (memorystatus_post_snapshot(priority, cause) == TRUE) {
4148 post_snapshot = TRUE;
4149 }
4150
4151 /* Jetsam Loop Detection */
4152 if (memorystatus_jld_enabled == TRUE) {
4153 if ((priority == JETSAM_PRIORITY_IDLE) || (priority == system_procs_aging_band) || (priority == applications_aging_band)) {
4154 jld_idle_kills++;
4155 } else {
4156 /*
4157 * We've reached into bands beyond idle deferred.
4158 * We make no attempt to monitor them.
4159 */
4160 }
4161 }
4162
4163 /*
4164 * If we have jetsammed a process in or above JETSAM_PRIORITY_UI_SUPPORT
4165 * then we attempt to relieve pressure by purging corpse memory and notifying
4166 * anybody wanting to know this.
4167 */
4168 if (priority >= JETSAM_PRIORITY_UI_SUPPORT) {
4169 memorystatus_approaching_fg_band(&corpse_list_purged);
4170 }
4171 goto done;
4172 }
4173
4174 if (memorystatus_avail_pages_below_critical()) {
4175 /*
4176 * Still under pressure and unable to kill a process - purge corpse memory
4177 * and get everything back from the pmap.
4178 */
4179 pmap_release_pages_fast();
4180 if (total_corpses_count() > 0) {
4181 task_purge_all_corpses();
4182 corpse_list_purged = TRUE;
4183 }
4184
4185 if (!jetsam_thread->limit_to_low_bands && memorystatus_avail_pages_below_critical()) {
4186 /*
4187 * Still under pressure and unable to kill a process - panic
4188 */
4189 panic("memorystatus_jetsam_thread: no victim! available pages:%llu\n", (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
4190 }
4191 }
4192
4193 done:
4194
4195 /*
4196 * We do not want to over-kill when thrashing has been detected.
4197 * To avoid that, we reset the flag here and notify the
4198 * compressor.
4199 */
4200 if (is_reason_thrashing(kill_under_pressure_cause)) {
4201 kill_under_pressure_cause = 0;
4202 #if CONFIG_JETSAM
4203 vm_thrashing_jetsam_done();
4204 #endif /* CONFIG_JETSAM */
4205 } else if (is_reason_zone_map_exhaustion(kill_under_pressure_cause)) {
4206 kill_under_pressure_cause = 0;
4207 }
4208
4209 os_reason_free(jetsam_reason);
4210 }
4211
4212 kill_under_pressure_cause = 0;
4213
4214 if (errors) {
4215 memorystatus_clear_errors();
4216 }
4217
4218 if (post_snapshot) {
4219 proc_list_lock();
4220 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
4221 sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
4222 uint64_t timestamp_now = mach_absolute_time();
4223 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
4224 memorystatus_jetsam_snapshot->js_gencount++;
4225 if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
4226 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
4227 proc_list_unlock();
4228 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
4229 if (!ret) {
4230 proc_list_lock();
4231 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
4232 proc_list_unlock();
4233 }
4234 } else {
4235 proc_list_unlock();
4236 }
4237 }
4238
4239 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
4240 MEMORYSTATUS_LOG_AVAILABLE_PAGES, total_memory_reclaimed, 0, 0, 0);
4241
4242 memorystatus_thread_block(0, memorystatus_thread);
4243 }
4244
4245 /*
4246 * Returns TRUE:
4247 * when an idle-exitable proc was killed
4248 * Returns FALSE:
4249 * when there are no more idle-exitable procs found
4250 * when the attempt to kill an idle-exitable proc failed
4251 */
4252 boolean_t
4253 memorystatus_idle_exit_from_VM(void)
4254 {
4255 /*
4256 * This routine should no longer be needed since we are
4257 * now using jetsam bands on all platforms and so will deal
4258 * with IDLE processes within the memorystatus thread itself.
4259 *
4260 * But we still use it because we observed that macOS systems
4261 * started heavy compression/swapping with a bunch of
4262 * idle-exitable processes alive and doing nothing. We decided
4263 * we would rather kill those processes than start swapping earlier.
4264 */
4265
4266 return kill_idle_exit_proc();
4267 }
4268
4269 /*
4270 * Callback invoked when allowable physical memory footprint exceeded
4271 * (dirty pages + IOKit mappings)
4272 *
4273 * This is invoked for both advisory, non-fatal per-task high watermarks,
4274 * as well as the fatal task memory limits.
4275 */
4276 void
4277 memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
4278 {
4279 os_reason_t jetsam_reason = OS_REASON_NULL;
4280
4281 proc_t p = current_proc();
4282
4283 #if VM_PRESSURE_EVENTS
4284 if (warning == TRUE) {
4285 /*
4286 * This is a warning path which implies that the current process is close, but has
4287 * not yet exceeded its per-process memory limit.
4288 */
4289 if (memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) {
4290 /* Print warning, since it's possible that task has not registered for pressure notifications */
4291 os_log(OS_LOG_DEFAULT, "memorystatus_on_ledger_footprint_exceeded: failed to warn the current task (%d exiting, or no handler registered?).\n", p->p_pid);
4292 }
4293 return;
4294 }
4295 #endif /* VM_PRESSURE_EVENTS */
4296
4297 if (memlimit_is_fatal) {
4298 /*
4299 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
4300 * has violated either the system-wide per-task memory limit OR its own task limit.
4301 */
4302 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
4303 if (jetsam_reason == NULL) {
4304 printf("task_exceeded footprint: failed to allocate jetsam reason\n");
4305 } else if (corpse_for_fatal_memkill != 0 && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
4306 /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
4307 jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
4308 }
4309
4310 if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) {
4311 printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
4312 }
4313 } else {
4314 /*
4315 * HWM offender exists. Done without locks or synchronization.
4316 * See comment near its declaration for more details.
4317 */
4318 memorystatus_hwm_candidates = TRUE;
4319
4320 #if VM_PRESSURE_EVENTS
4321 /*
4322 * The current process is not in the warning path.
4323 * This path implies the current process has exceeded a non-fatal (soft) memory limit.
4324 * Failure to send note is ignored here.
4325 */
4326 (void)memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */);
4327
4328 #endif /* VM_PRESSURE_EVENTS */
4329 }
4330 }
4331
4332 void
4333 memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
4334 {
4335 proc_t p = current_proc();
4336
4337 /*
4338 * The limit violation is logged here, but only once per process per limit.
4339 * Soft memory limit is a non-fatal high-water-mark
4340 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
4341 */
4342
4343 os_log_with_startup_serial(OS_LOG_DEFAULT, "EXC_RESOURCE -> %s[%d] exceeded mem limit: %s%s %d MB (%s)\n",
4344 ((p && *p->p_name) ? p->p_name : "unknown"), (p ? p->p_pid : -1), (memlimit_is_active ? "Active" : "Inactive"),
4345 (memlimit_is_fatal ? "Hard" : "Soft"), max_footprint_mb,
4346 (memlimit_is_fatal ? "fatal" : "non-fatal"));
4347
4348 return;
4349 }
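
/*
 * For illustration, a hypothetical process "helloworld" (pid 123) tripping
 * a 100 MB active hard limit would be logged by the above as:
 *
 *	EXC_RESOURCE -> helloworld[123] exceeded mem limit: ActiveHard 100 MB (fatal)
 */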
4350
4351
4352 /*
4353 * Description:
4354 * Evaluates process state to determine which limit
4355 * should be applied (active vs. inactive limit).
4356 *
4357 * Processes that have the 'elevated inactive jetsam band' attribute
4358 * are first evaluated based on their current priority band.
4359 * presently elevated ==> active
4360 *
4361 * Processes that opt into dirty tracking are evaluated
4362 * based on clean vs dirty state.
4363 * dirty ==> active
4364 * clean ==> inactive
4365 *
4366 * Processes that do not opt into dirty tracking are
4367 * evaluated based on priority level.
4368 * Foreground or above ==> active
4369 * Below Foreground ==> inactive
4370 *
4371 * Return: TRUE if active
4372 * FALSE if inactive
4373 */
4374
4375 static boolean_t
4376 proc_jetsam_state_is_active_locked(proc_t p)
4377 {
4378 if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) &&
4379 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE)) {
4380 /*
4381 * process has the 'elevated inactive jetsam band' attribute
4382 * and process is present in the elevated band
4383 * implies active state
4384 */
4385 return TRUE;
4386 } else if (p->p_memstat_dirty & P_DIRTY_TRACK) {
4387 /*
4388 * process has opted into dirty tracking
4389 * active state is based on dirty vs. clean
4390 */
4391 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
4392 /*
4393 * process is dirty
4394 * implies active state
4395 */
4396 return TRUE;
4397 } else {
4398 /*
4399 * process is clean
4400 * implies inactive state
4401 */
4402 return FALSE;
4403 }
4404 } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
4405 /*
4406 * process is Foreground or higher
4407 * implies active state
4408 */
4409 return TRUE;
4410 } else {
4411 /*
4412 * process found below Foreground
4413 * implies inactive state
4414 */
4415 return FALSE;
4416 }
4417 }
4418
4419 static boolean_t
4420 memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
4421 {
4422 boolean_t res;
4423
4424 uint32_t errors = 0;
4425 uint64_t memory_reclaimed = 0;
4426
4427 if (victim_pid == -1) {
4428 /* No pid, so kill first process */
4429 res = memorystatus_kill_top_process(TRUE, TRUE, cause, jetsam_reason, NULL, &errors, &memory_reclaimed);
4430 } else {
4431 res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason);
4432 }
4433
4434 if (errors) {
4435 memorystatus_clear_errors();
4436 }
4437
4438 if (res == TRUE) {
4439 /* Fire off snapshot notification */
4440 proc_list_lock();
4441 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
4442 sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
4443 uint64_t timestamp_now = mach_absolute_time();
4444 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
4445 if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
4446 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
4447 proc_list_unlock();
4448 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
4449 if (!ret) {
4450 proc_list_lock();
4451 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
4452 proc_list_unlock();
4453 }
4454 } else {
4455 proc_list_unlock();
4456 }
4457 }
4458
4459 return res;
4460 }
4461
4462 /*
4463 * Jetsam a specific process.
4464 */
4465 static boolean_t
4466 memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
4467 {
4468 boolean_t killed;
4469 proc_t p;
4470 uint64_t killtime = 0;
4471 uint64_t footprint_of_killed_proc;
4472 clock_sec_t tv_sec;
4473 clock_usec_t tv_usec;
4474 uint32_t tv_msec;
4475
4476 /* TODO - add a victim queue and push this into the main jetsam thread */
4477
4478 p = proc_find(victim_pid);
4479 if (!p) {
4480 os_reason_free(jetsam_reason);
4481 return FALSE;
4482 }
4483
4484 proc_list_lock();
4485
4486 if (memorystatus_jetsam_snapshot_count == 0) {
4487 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
4488 }
4489
4490 killtime = mach_absolute_time();
4491 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
4492 tv_msec = tv_usec / 1000;
4493
4494 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
4495
4496 proc_list_unlock();
4497
4498 killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
4499
4500 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n",
4501 (unsigned long)tv_sec, tv_msec, victim_pid, ((p && *p->p_name) ? p->p_name : "unknown"),
4502 memorystatus_kill_cause_name[cause], (p ? p->p_memstat_effectivepriority: -1),
4503 footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
4504
4505 proc_rele(p);
4506
4507 return killed;
4508 }
4509
4510
4511 /*
4512 * Toggle the P_MEMSTAT_TERMINATED state.
4513 * Takes the proc_list_lock.
4514 */
4515 void
4516 proc_memstat_terminated(proc_t p, boolean_t set)
4517 {
4518 #if DEVELOPMENT || DEBUG
4519 if (p) {
4520 proc_list_lock();
4521 if (set == TRUE) {
4522 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
4523 } else {
4524 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
4525 }
4526 proc_list_unlock();
4527 }
4528 #else
4529 #pragma unused(p, set)
4530 /*
4531 * do nothing
4532 */
4533 #endif /* DEVELOPMENT || DEBUG */
4534 return;
4535 }
4536
4537
4538 #if CONFIG_JETSAM
4539 /*
4540 * This is invoked when cpulimits have been exceeded while in fatal mode.
4541 * The jetsam_flags do not apply as those are for memory related kills.
4542 * We call this routine so that the offending process is killed with
4543 * a non-zero exit status.
4544 */
4545 void
4546 jetsam_on_ledger_cpulimit_exceeded(void)
4547 {
4548 int retval = 0;
4549 int jetsam_flags = 0; /* make it obvious */
4550 proc_t p = current_proc();
4551 os_reason_t jetsam_reason = OS_REASON_NULL;
4552
4553 printf("task_exceeded_cpulimit: killing pid %d [%s]\n",
4554 p->p_pid, (*p->p_name ? p->p_name : "(unknown)"));
4555
4556 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT);
4557 if (jetsam_reason == OS_REASON_NULL) {
4558 printf("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n");
4559 }
4560
4561 retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
4562
4563 if (retval) {
4564 printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
4565 }
4566 }
4567
4568 #endif /* CONFIG_JETSAM */
4569
4570 static void
4571 memorystatus_get_task_memory_region_count(task_t task, uint64_t *count)
4572 {
4573 assert(task);
4574 assert(count);
4575
4576 *count = get_task_memory_region_count(task);
4577 }
4578
4579
4580 #define MEMORYSTATUS_VM_MAP_FORK_ALLOWED 0x100000000
4581 #define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000
4582
4583 #if DEVELOPMENT || DEBUG
4584
4585 /*
4586 * Sysctl only used to test memorystatus_allowed_vm_map_fork() path.
4587 * set a new pidwatch value
4588 * or
4589 * get the current pidwatch value
4590 *
4591 * The pidwatch_val starts out with a PID to watch for in the map_fork path.
4592 * Its value is:
4593 * - OR'd with MEMORYSTATUS_VM_MAP_FORK_ALLOWED if we allow the map_fork.
4594 * - OR'd with MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED if we disallow the map_fork.
4595 * - set to -1ull if the map_fork() is aborted for other reasons.
4596 */
4597
4598 uint64_t memorystatus_vm_map_fork_pidwatch_val = 0;
4599
4600 static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS {
4601 #pragma unused(oidp, arg1, arg2)
4602
4603 uint64_t new_value = 0;
4604 uint64_t old_value = 0;
4605 int error = 0;
4606
4607 /*
4608 * The pid is held in the low 32 bits.
4609 * The 'allowed' flags are in the upper 32 bits.
4610 */
4611 old_value = memorystatus_vm_map_fork_pidwatch_val;
4612
4613 error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL);
4614
4615 if (error || !req->newptr) {
4616 /*
4617 * No new value passed in.
4618 */
4619 return error;
4620 }
4621
4622 /*
4623 * A new pid was passed in via req->newptr.
4624 * Ignore any attempt to set the higher order bits.
4625 */
4626 memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF;
4627 printf("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx \n", old_value, new_value);
4628
4629 return error;
4630 }
4631
4632 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED,
4633 0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork");
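
/*
 * Example usage from the shell on a DEVELOPMENT/DEBUG kernel (pid 123 is
 * hypothetical):
 *
 *	sysctl kern.memorystatus_vm_map_fork_pidwatch=123    # arm the watch
 *	sysctl kern.memorystatus_vm_map_fork_pidwatch        # read back pid + result flags
 *
 * Once pid 123 goes through the corpse-fork path, the upper 32 bits carry
 * MEMORYSTATUS_VM_MAP_FORK_ALLOWED or MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED,
 * as described above.
 */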
4634
4635
4636 /*
4637 * Record if a watched process fails to qualify for a vm_map_fork().
4638 */
4639 void
4640 memorystatus_abort_vm_map_fork(task_t task)
4641 {
4642 if (memorystatus_vm_map_fork_pidwatch_val != 0) {
4643 proc_t p = get_bsdtask_info(task);
4644 if (p != NULL && memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid) {
4645 memorystatus_vm_map_fork_pidwatch_val = -1ull;
4646 }
4647 }
4648 }
4649
4650 static void
4651 set_vm_map_fork_pidwatch(task_t task, uint64_t x)
4652 {
4653 if (memorystatus_vm_map_fork_pidwatch_val != 0) {
4654 proc_t p = get_bsdtask_info(task);
4655 if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid)) {
4656 memorystatus_vm_map_fork_pidwatch_val |= x;
4657 }
4658 }
4659 }
4660
4661 #else /* DEVELOPMENT || DEBUG */
4662
4663
4664 static void
4665 set_vm_map_fork_pidwatch(task_t task, uint64_t x)
4666 {
4667 #pragma unused(task)
4668 #pragma unused(x)
4669 }
4670
4671 #endif /* DEVELOPMENT || DEBUG */
4672
4673 /*
4674 * Called during EXC_RESOURCE handling when a process exceeds a soft
4675 * memory limit. This is the corpse fork path and here we decide if
4676 * vm_map_fork will be allowed when creating the corpse.
4677 * The task being considered is suspended.
4678 *
4679 * By default, a vm_map_fork is allowed to proceed.
4680 *
4681 * A few simple policy assumptions:
4682 * If the device has a zero system-wide task limit,
4683 * then the vm_map_fork is allowed. macOS always has a zero
4684 * system-wide task limit (unless overridden by a boot-arg).
4685 *
4686 * And if a process's memory footprint is less
4687 * than or equal to a quarter of the system-wide task limit,
4688 * then the vm_map_fork is allowed. This calculation
4689 * is based on the assumption that a process can
4690 * munch memory up to the system-wide task limit.
4691 */
4692 extern boolean_t corpse_threshold_system_limit;
4693 boolean_t
4694 memorystatus_allowed_vm_map_fork(task_t task)
4695 {
4696 boolean_t is_allowed = TRUE; /* default */
4697
4698 uint64_t footprint_in_bytes;
4699 uint64_t max_allowed_bytes;
4700
4701 if (max_task_footprint_mb == 0) {
4702 set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
4703 return is_allowed;
4704 }
4705
4706 footprint_in_bytes = get_task_phys_footprint(task);
4707
4708 /*
4709 * Maximum is 1/4 of the system-wide task limit by default.
4710 */
4711 max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2;
4712
4713 #if DEBUG || DEVELOPMENT
4714 if (corpse_threshold_system_limit) {
4715 max_allowed_bytes = (uint64_t)max_task_footprint_mb * (1UL << 20);
4716 }
4717 #endif /* DEBUG || DEVELOPMENT */
4718
4719 if (footprint_in_bytes > max_allowed_bytes) {
4720 printf("memorystatus disallowed vm_map_fork %lld %lld\n", footprint_in_bytes, max_allowed_bytes);
4721 set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED);
4722 return !is_allowed;
4723 }
4724
4725 set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
4726 return is_allowed;
4727 }
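
/*
 * Worked example with a hypothetical limit: if max_task_footprint_mb is
 * 2048, then max_allowed_bytes = (2048 MB) >> 2 = 512 MB. A suspended task
 * with a 600 MB physical footprint is therefore denied the corpse
 * vm_map_fork, while a 400 MB task is allowed.
 */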
4728
4729 void
4730 memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
4731 {
4732 assert(task);
4733 assert(footprint);
4734
4735 uint64_t pages;
4736
4737 pages = (get_task_phys_footprint(task) / PAGE_SIZE_64);
4738 assert(((uint32_t)pages) == pages);
4739 *footprint = (uint32_t)pages;
4740
4741 if (max_footprint_lifetime) {
4742 pages = (get_task_phys_footprint_lifetime_max(task) / PAGE_SIZE_64);
4743 assert(((uint32_t)pages) == pages);
4744 *max_footprint_lifetime = (uint32_t)pages;
4745 }
4746 if (purgeable_pages) {
4747 pages = (get_task_purgeable_size(task) / PAGE_SIZE_64);
4748 assert(((uint32_t)pages) == pages);
4749 *purgeable_pages = (uint32_t)pages;
4750 }
4751 }
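
/*
 * Worked example of the conversion above (illustrative numbers, a sketch
 * only): with 16 KB pages (PAGE_SIZE_64 == 16384), a 64 MB physical
 * footprint reports 67108864 / 16384 = 4096 pages, which fits comfortably
 * in the uint32_t the callers expect.
 */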
4752
4753 static void
4754 memorystatus_get_task_phys_footprint_page_counts(task_t task,
4755 uint64_t *internal_pages, uint64_t *internal_compressed_pages,
4756 uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
4757 uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
4758 uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages)
4759 {
4760 assert(task);
4761
4762 if (internal_pages) {
4763 *internal_pages = (get_task_internal(task) / PAGE_SIZE_64);
4764 }
4765
4766 if (internal_compressed_pages) {
4767 *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64);
4768 }
4769
4770 if (purgeable_nonvolatile_pages) {
4771 *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64);
4772 }
4773
4774 if (purgeable_nonvolatile_compressed_pages) {
4775 *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64);
4776 }
4777
4778 if (alternate_accounting_pages) {
4779 *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64);
4780 }
4781
4782 if (alternate_accounting_compressed_pages) {
4783 *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64);
4784 }
4785
4786 if (iokit_mapped_pages) {
4787 *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64);
4788 }
4789
4790 if (page_table_pages) {
4791 *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64);
4792 }
4793
4794 #if CONFIG_FREEZE
4795 if (frozen_to_swap_pages) {
4796 *frozen_to_swap_pages = (get_task_frozen_to_swap(task) / PAGE_SIZE_64);
4797 }
4798 #else /* CONFIG_FREEZE */
4799 #pragma unused(frozen_to_swap_pages)
4800 #endif /* CONFIG_FREEZE */
4801 }
4802
4803 #if CONFIG_FREEZE
4804 /*
4805 * Copies the source entry into the destination snapshot.
4806 * Returns true on success. Fails if the destination snapshot is full.
4807 * Caller must hold the proc list lock.
4808 */
4809 static bool
4810 memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t *dst_snapshot, unsigned int dst_snapshot_size, const memorystatus_jetsam_snapshot_entry_t *src_entry)
4811 {
4812 LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
4813 assert(dst_snapshot);
4814
4815 if (dst_snapshot->entry_count == dst_snapshot_size) {
4816 /* Destination snapshot is full. Cannot be updated until it is consumed. */
4817 return false;
4818 }
4819 if (dst_snapshot->entry_count == 0) {
4820 memorystatus_init_jetsam_snapshot_header(dst_snapshot);
4821 }
4822 memorystatus_jetsam_snapshot_entry_t *dst_entry = &dst_snapshot->entries[dst_snapshot->entry_count++];
4823 memcpy(dst_entry, src_entry, sizeof(memorystatus_jetsam_snapshot_entry_t));
4824 return true;
4825 }
4826 #endif /* CONFIG_FREEZE */
4827
4828 static bool
4829 memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t *snapshot, proc_t p, uint32_t kill_cause, uint64_t killtime, memorystatus_jetsam_snapshot_entry_t **entry)
4830 {
4831 LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
4832 memorystatus_jetsam_snapshot_entry_t *snapshot_list = snapshot->entries;
4833 size_t i = snapshot->entry_count;
4834
4835 if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], (snapshot->js_gencount)) == TRUE) {
4836 *entry = &snapshot_list[i];
4837 (*entry)->killed = kill_cause;
4838 (*entry)->jse_killtime = killtime;
4839
4840 snapshot->entry_count = i + 1;
4841 return true;
4842 }
4843 return false;
4844 }
4845
4846 /*
4847 * This routine only acts on the global jetsam event snapshot.
4848 * Updating the process's entry can race when the memorystatus_thread
4849 * has chosen to kill a process that is racing to exit on another core.
4850 */
4851 static void
4852 memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime)
4853 {
4854 memorystatus_jetsam_snapshot_entry_t *entry = NULL;
4855 memorystatus_jetsam_snapshot_t *snapshot = NULL;
4856 memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
4857
4858 unsigned int i;
4859 #if CONFIG_FREEZE
4860 bool copied_to_freezer_snapshot = false;
4861 #endif /* CONFIG_FREEZE */
4862
4863 LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
4864
4865 if (memorystatus_jetsam_snapshot_count == 0) {
4866 /*
4867 * No active snapshot.
4868 * Nothing to do.
4869 */
4870 goto exit;
4871 }
4872
4873 /*
4874 * Sanity check as this routine should only be called
4875 * from a jetsam kill path.
4876 */
4877 assert(kill_cause != 0 && killtime != 0);
4878
4879 snapshot = memorystatus_jetsam_snapshot;
4880 snapshot_list = memorystatus_jetsam_snapshot->entries;
4881
4882 for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
4883 if (snapshot_list[i].pid == p->p_pid) {
4884 entry = &snapshot_list[i];
4885
4886 if (entry->killed || entry->jse_killtime) {
4887 /*
4888 * We apparently raced on the exit path
4889 * for this process, as its snapshot entry
4890 * has already recorded a kill.
4891 */
4892 assert(entry->killed && entry->jse_killtime);
4893 break;
4894 }
4895
4896 /*
4897 * Update the entry we just found in the snapshot.
4898 */
4899
4900 entry->killed = kill_cause;
4901 entry->jse_killtime = killtime;
4902 entry->jse_gencount = snapshot->js_gencount;
4903 entry->jse_idle_delta = p->p_memstat_idle_delta;
4904 #if CONFIG_FREEZE
4905 entry->jse_thaw_count = p->p_memstat_thaw_count;
4906 #else /* CONFIG_FREEZE */
4907 entry->jse_thaw_count = 0;
4908 #endif /* CONFIG_FREEZE */
4909
4910 /*
4911 * If a process has moved between bands since the snapshot was
4912 * initialized, then these fields have likely changed too.
4913 */
4914 if (entry->priority != p->p_memstat_effectivepriority) {
4915 strlcpy(entry->name, p->p_name, sizeof(entry->name));
4916 entry->priority = p->p_memstat_effectivepriority;
4917 entry->state = memorystatus_build_state(p);
4918 entry->user_data = p->p_memstat_userdata;
4919 entry->fds = p->p_fd->fd_nfiles;
4920 }
4921
4922 /*
4923 * Always update the page counts on a kill.
4924 */
4925
4926 uint32_t pages = 0;
4927 uint32_t max_pages_lifetime = 0;
4928 uint32_t purgeable_pages = 0;
4929
4930 memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages);
4931 entry->pages = (uint64_t)pages;
4932 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
4933 entry->purgeable_pages = (uint64_t)purgeable_pages;
4934
4935 uint64_t internal_pages = 0;
4936 uint64_t internal_compressed_pages = 0;
4937 uint64_t purgeable_nonvolatile_pages = 0;
4938 uint64_t purgeable_nonvolatile_compressed_pages = 0;
4939 uint64_t alternate_accounting_pages = 0;
4940 uint64_t alternate_accounting_compressed_pages = 0;
4941 uint64_t iokit_mapped_pages = 0;
4942 uint64_t page_table_pages = 0;
4943 uint64_t frozen_to_swap_pages = 0;
4944
4945 memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
4946 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
4947 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
4948 &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages);
4949
4950 entry->jse_internal_pages = internal_pages;
4951 entry->jse_internal_compressed_pages = internal_compressed_pages;
4952 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
4953 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
4954 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
4955 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
4956 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
4957 entry->jse_page_table_pages = page_table_pages;
4958 entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
4959
4960 uint64_t region_count = 0;
4961 memorystatus_get_task_memory_region_count(p->task, &region_count);
4962 entry->jse_memory_region_count = region_count;
4963
4964 goto exit;
4965 }
4966 }
4967
4968 if (entry == NULL) {
4969 /*
4970 * The entry was not found in the snapshot, so the process must have
4971 * launched after the snapshot was initialized.
4972 * Let's try to append the new entry.
4973 */
4974 if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) {
4975 /*
4976 * A populated snapshot buffer exists
4977 * and there is room to init a new entry.
4978 */
4979 assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count);
4980
4981 if (memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry)) {
4982 memorystatus_jetsam_snapshot_count++;
4983
4984 if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) {
4985 /*
4986 * We just used the last slot in the snapshot buffer.
4987 * We only want to log it once... so we do it here
4988 * when we notice we've hit the max.
4989 */
4990 printf("memorystatus: WARNING snapshot buffer is full, count %d\n",
4991 memorystatus_jetsam_snapshot_count);
4992 }
4993 }
4994 }
4995 }
4996
4997 exit:
4998 if (entry) {
4999 #if CONFIG_FREEZE
5000 if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
5001 /* This is an app kill. Record it in the freezer snapshot so dasd can incorporate this in its recommendations. */
5002 copied_to_freezer_snapshot = memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_freezer, memorystatus_jetsam_snapshot_freezer_max, entry);
5003 if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
5004 /*
5005 * We just used the last slot in the freezer snapshot buffer.
5006 * We only want to log it once... so we do it here
5007 * when we notice we've hit the max.
5008 */
5009 os_log_error(OS_LOG_DEFAULT, "memorystatus: WARNING freezer snapshot buffer is full, count %zu",
5010 memorystatus_jetsam_snapshot_freezer->entry_count);
5011 }
5012 }
5013 #endif /* CONFIG_FREEZE */
5014 } else {
5015 /*
5016 * If we reach here, the snapshot buffer could not be updated.
5017 * Most likely, the buffer is full, in which case we would have
5018 * logged a warning in the previous call.
5019 *
5020 * For now, we will stop appending snapshot entries.
5021 * When the buffer is consumed, the snapshot state will reset.
5022 */
5023
5024 MEMORYSTATUS_DEBUG(4, "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n",
5025 p->p_pid, p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count);
5026
5027 #if CONFIG_FREEZE
5028 /* We still attempt to record this in the freezer snapshot */
5029 if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
5030 snapshot = memorystatus_jetsam_snapshot_freezer;
5031 if (snapshot->entry_count < memorystatus_jetsam_snapshot_freezer_max) {
5032 copied_to_freezer_snapshot = memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry);
5033 if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
5034 /*
5035 * We just used the last slot in the freezer snapshot buffer.
5036 * We only want to log it once... so we do it here
5037 * when we notice we've hit the max.
5038 */
5039 os_log_error(OS_LOG_DEFAULT, "memorystatus: WARNING freezer snapshot buffer is full, count %zu",
5040 memorystatus_jetsam_snapshot_freezer->entry_count);
5041 }
5042 }
5043 }
5044 #endif /* CONFIG_FREEZE */
5045 }
5046
5047 return;
5048 }
5049
5050 #if CONFIG_JETSAM
5051 void
5052 memorystatus_pages_update(unsigned int pages_avail)
5053 {
5054 memorystatus_available_pages = pages_avail;
5055
5056 #if VM_PRESSURE_EVENTS
5057 /*
5058 * Since memorystatus_available_pages changes, we should
5059 * re-evaluate the pressure levels on the system and
5060 * check if we need to wake the pressure thread.
5061 * We also update memorystatus_level in that routine.
5062 */
5063 vm_pressure_response();
5064
5065 if (memorystatus_available_pages <= memorystatus_available_pages_pressure) {
5066 if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) {
5067 memorystatus_thread_wake();
5068 }
5069 }
5070 #if CONFIG_FREEZE
5071 /*
5072 * We can't grab the freezer_mutex here even though that synchronization would be correct for
5073 * inspecting the number of frozen processes and waking the freezer thread. We may arrive
5074 * here with the page-queue locks held and preemption disabled, so trying to grab a mutex
5075 * would trigger the "mutex with preemption disabled" panic.
5076 */
5077
5078 if (memorystatus_freeze_thread_should_run() == TRUE) {
5079 /*
5080 * The freezer thread is usually woken up by a user-space call, e.g. pid_hibernate() on any process.
5081 * That trigger isn't invoked often enough, so we add this explicit wakeup here.
5082 */
5083 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
5084 thread_wakeup((event_t)&memorystatus_freeze_wakeup);
5085 }
5086 }
5087 #endif /* CONFIG_FREEZE */
5088
5089 #else /* VM_PRESSURE_EVENTS */
5090
5091 boolean_t critical, delta;
5092
5093 if (!memorystatus_delta) {
5094 return;
5095 }
5096
5097 critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
5098 delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
5099 || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;
5100
5101 if (critical || delta) {
5102 unsigned int total_pages;
5103
5104 total_pages = (unsigned int) atop_64(max_mem);
5105 #if CONFIG_SECLUDED_MEMORY
5106 total_pages -= vm_page_secluded_count;
5107 #endif /* CONFIG_SECLUDED_MEMORY */
5108 memorystatus_level = memorystatus_available_pages * 100 / total_pages;
5109 memorystatus_thread_wake();
5110 }
5111 #endif /* VM_PRESSURE_EVENTS */
5112 }
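
/*
 * Worked example of the level computation above (illustrative numbers, a
 * sketch only): with max_mem = 4 GB and 16 KB pages, total_pages is
 * 262144; if 52429 pages are available, memorystatus_level becomes
 * 52429 * 100 / 262144 = 20 (percent).
 */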
5113 #endif /* CONFIG_JETSAM */
5114
5115 static boolean_t
5116 memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount)
5117 {
5118 clock_sec_t tv_sec;
5119 clock_usec_t tv_usec;
5120 uint32_t pages = 0;
5121 uint32_t max_pages_lifetime = 0;
5122 uint32_t purgeable_pages = 0;
5123 uint64_t internal_pages = 0;
5124 uint64_t internal_compressed_pages = 0;
5125 uint64_t purgeable_nonvolatile_pages = 0;
5126 uint64_t purgeable_nonvolatile_compressed_pages = 0;
5127 uint64_t alternate_accounting_pages = 0;
5128 uint64_t alternate_accounting_compressed_pages = 0;
5129 uint64_t iokit_mapped_pages = 0;
5130 uint64_t page_table_pages = 0;
5131 uint64_t frozen_to_swap_pages = 0;
5132 uint64_t region_count = 0;
5133 uint64_t cids[COALITION_NUM_TYPES];
5134
5135 memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
5136
5137 entry->pid = p->p_pid;
5138 strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
5139 entry->priority = p->p_memstat_effectivepriority;
5140
5141 memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages);
5142 entry->pages = (uint64_t)pages;
5143 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
5144 entry->purgeable_pages = (uint64_t)purgeable_pages;
5145
5146 memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
5147 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
5148 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
5149 &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages);
5150
5151 entry->jse_internal_pages = internal_pages;
5152 entry->jse_internal_compressed_pages = internal_compressed_pages;
5153 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
5154 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
5155 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
5156 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
5157 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
5158 entry->jse_page_table_pages = page_table_pages;
5159 entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
5160
5161 memorystatus_get_task_memory_region_count(p->task, &region_count);
5162 entry->jse_memory_region_count = region_count;
5163
5164 entry->state = memorystatus_build_state(p);
5165 entry->user_data = p->p_memstat_userdata;
5166 memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid));
5167 entry->fds = p->p_fd->fd_nfiles;
5168
5169 absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
5170 entry->cpu_time.tv_sec = (int64_t)tv_sec;
5171 entry->cpu_time.tv_usec = (int64_t)tv_usec;
5172
5173 assert(p->p_stats != NULL);
5174 entry->jse_starttime = p->p_stats->ps_start; /* abstime process started */
5175 entry->jse_killtime = 0; /* abstime jetsam chose to kill process */
5176 entry->killed = 0; /* the jetsam kill cause */
5177 entry->jse_gencount = gencount; /* indicates a pass through jetsam thread, when process was targeted to be killed */
5178
5179 entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */
5180
5181 #if CONFIG_FREEZE
5182 entry->jse_thaw_count = p->p_memstat_thaw_count;
5183 #else /* CONFIG_FREEZE */
5184 entry->jse_thaw_count = 0;
5185 #endif /* CONFIG_FREEZE */
5186
5187 proc_coalitionids(p, cids);
5188 entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
5189
5190 return TRUE;
5191 }
5192
5193 static void
5194 memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
5195 {
5196 kern_return_t kr = KERN_SUCCESS;
5197 mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
5198 vm_statistics64_data_t vm_stat;
5199
5200 if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
5201 printf("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr);
5202 memset(&snapshot->stats, 0, sizeof(snapshot->stats));
5203 } else {
5204 snapshot->stats.free_pages = vm_stat.free_count;
5205 snapshot->stats.active_pages = vm_stat.active_count;
5206 snapshot->stats.inactive_pages = vm_stat.inactive_count;
5207 snapshot->stats.throttled_pages = vm_stat.throttled_count;
5208 snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
5209 snapshot->stats.wired_pages = vm_stat.wire_count;
5210
5211 snapshot->stats.speculative_pages = vm_stat.speculative_count;
5212 snapshot->stats.filebacked_pages = vm_stat.external_page_count;
5213 snapshot->stats.anonymous_pages = vm_stat.internal_page_count;
5214 snapshot->stats.compressions = vm_stat.compressions;
5215 snapshot->stats.decompressions = vm_stat.decompressions;
5216 snapshot->stats.compressor_pages = vm_stat.compressor_page_count;
5217 snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
5218 }
5219
5220 get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity);
5221
5222 bzero(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name));
5223 get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name),
5224 &snapshot->stats.largest_zone_size);
5225 }
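
/*
 * Hedged user-space sketch: the same vm_statistics64 sample can be taken
 * outside the kernel through the public Mach host API (note that user space
 * uses mach_host_self() where the kernel above uses host_self()). The
 * helper name is illustrative; error handling is trimmed.
 */
#include <mach/mach.h>
#include <stdio.h>

static void
print_vm_stat_sample(void)
{
	vm_statistics64_data_t vm_stat;
	mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;

	if (host_statistics64(mach_host_self(), HOST_VM_INFO64,
	    (host_info64_t)&vm_stat, &count) == KERN_SUCCESS) {
		printf("free %u active %u inactive %u wired %u\n",
		    vm_stat.free_count, vm_stat.active_count,
		    vm_stat.inactive_count, vm_stat.wire_count);
	}
}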
5226
5227 /*
5228 * Collect vm statistics at boot.
5229 * Called only once (see kern_exec.c)
5230 * Data can be consumed at any time.
5231 */
5232 void
5233 memorystatus_init_at_boot_snapshot(void)
5234 {
5235 memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
5236 memorystatus_at_boot_snapshot.entry_count = 0;
5237 memorystatus_at_boot_snapshot.notification_time = 0; /* updated when consumed */
5238 memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
5239 }
5240
5241 static void
5242 memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot)
5243 {
5244 memorystatus_init_snapshot_vmstats(snapshot);
5245 snapshot->snapshot_time = mach_absolute_time();
5246 snapshot->notification_time = 0;
5247 snapshot->js_gencount = 0;
5248 }
5249
5250 static void
5251 memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count)
5252 {
5253 proc_t p, next_p;
5254 unsigned int b = 0, i = 0;
5255
5256 memorystatus_jetsam_snapshot_t *snapshot = NULL;
5257 memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
5258 unsigned int snapshot_max = 0;
5259
5260 LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5261
5262 if (od_snapshot) {
5263 /*
5264 * This is an on_demand snapshot
5265 */
5266 snapshot = od_snapshot;
5267 snapshot_list = od_snapshot->entries;
5268 snapshot_max = ods_list_count;
5269 } else {
5270 /*
5271 * This is a jetsam event snapshot
5272 */
5273 snapshot = memorystatus_jetsam_snapshot;
5274 snapshot_list = memorystatus_jetsam_snapshot->entries;
5275 snapshot_max = memorystatus_jetsam_snapshot_max;
5276 }
5277
5278 memorystatus_init_jetsam_snapshot_header(snapshot);
5279
5280 next_p = memorystatus_get_first_proc_locked(&b, TRUE);
5281 while (next_p) {
5282 p = next_p;
5283 next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);
5284
5285 if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) {
5286 continue;
5287 }
5288
5289 MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
5290 p->p_pid,
5291 p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7],
5292 p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]);
5293
5294 if (++i == snapshot_max) {
5295 break;
5296 }
5297 }
5298
5299 snapshot->entry_count = i;
5300
5301 if (!od_snapshot) {
5302 /* update the system buffer count */
5303 memorystatus_jetsam_snapshot_count = i;
5304 }
5305 }
5306
5307 #if DEVELOPMENT || DEBUG
5308
5309 #if CONFIG_JETSAM
5310 static int
5311 memorystatus_cmd_set_panic_bits(user_addr_t buffer, size_t buffer_size)
5312 {
5313 int ret;
5314 memorystatus_jetsam_panic_options_t debug;
5315
5316 if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) {
5317 return EINVAL;
5318 }
5319
5320 ret = copyin(buffer, &debug, buffer_size);
5321 if (ret) {
5322 return ret;
5323 }
5324
5325 /* Panic bits match kMemorystatusKilled* enum */
5326 memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask);
5327
5328 /* Copyout new value */
5329 debug.data = memorystatus_jetsam_panic_debug;
5330 ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t));
5331
5332 return ret;
5333 }
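
/*
 * Illustrative mask/data update (a sketch with made-up bits): if the
 * current panic bits are 0b0110, mask is 0b1100 and data is 0b1000, the
 * result is (0b0110 & ~0b1100) | (0b1000 & 0b1100) = 0b0010 | 0b1000
 * = 0b1010: only the masked bits change.
 */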
5334 #endif /* CONFIG_JETSAM */
5335
5336 /*
5337 * Verify that the given bucket has been sorted correctly.
5338 *
5339 * Walks through the bucket and verifies that all pids in the
5340 * expected_order buffer are in that bucket and in the same
5341 * relative order.
5342 *
5343 * The proc_list_lock must be held by the caller.
5344 */
5345 static int
5346 memorystatus_verify_sort_order(unsigned int bucket_index, pid_t *expected_order, size_t num_pids)
5347 {
5348 LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5349
5350 int error = 0;
5351 proc_t p = NULL;
5352 size_t i = 0;
5353
5354 /*
5355 * NB: We allow other procs to be interleaved with the expected ones.
5356 * We just need the expected procs to be in the right order relative to each other.
5357 */
5358 p = memorystatus_get_first_proc_locked(&bucket_index, FALSE);
5359 while (p) {
5360 if (p->p_pid == expected_order[i]) {
5361 i++;
5362 }
5363 if (i == num_pids) {
5364 break;
5365 }
5366 p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE);
5367 }
5368 if (i != num_pids) {
5369 char buffer[128];
5370 size_t len = sizeof(buffer);
5371 size_t buffer_idx = 0;
5372 os_log_error(OS_LOG_DEFAULT, "memorystatus_verify_sort_order: Processes in bucket %d were not sorted properly\n", bucket_index);
5373 for (i = 0; i < num_pids; i++) {
5374 int num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%d,", expected_order[i]);
5375 if (num_written <= 0) {
5376 break;
5377 }
5378 if (buffer_idx + (unsigned int) num_written >= len) {
5379 break;
5380 }
5381 buffer_idx += num_written;
5382 }
5383 os_log_error(OS_LOG_DEFAULT, "memorystatus_verify_sort_order: Expected order [%s]", buffer);
5384 memset(buffer, 0, len);
5385 buffer_idx = 0;
5386 p = memorystatus_get_first_proc_locked(&bucket_index, FALSE);
5387 i = 0;
5388 os_log_error(OS_LOG_DEFAULT, "memorystatus_verify_sort_order: Actual order:");
5389 while (p) {
5390 int num_written;
5391 if (buffer_idx == 0) {
5392 num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%zu: %d,", i, p->p_pid);
5393 } else {
5394 num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%d,", p->p_pid);
5395 }
5396 if (num_written <= 0) {
5397 break;
5398 }
5399 buffer_idx += (unsigned int) num_written;
5400 assert(buffer_idx <= len);
5401 if (i % 10 == 0) {
5402 os_log_error(OS_LOG_DEFAULT, "memorystatus_verify_sort_order: %s", buffer);
5403 buffer_idx = 0;
5404 }
5405 p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE);
5406 i++;
5407 }
5408 if (buffer_idx != 0) {
5409 os_log_error(OS_LOG_DEFAULT, "memorystatus_verify_sort_order: %s", buffer);
5410 }
5411 error = EINVAL;
5412 }
5413 return error;
5414 }
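
/*
 * Illustrative sketch of the relative-order check above: the expected pids
 * must appear as a subsequence of the bucket's pid sequence. Standalone
 * form (names hypothetical, not kernel code):
 */
static bool
pids_are_subsequence(const pid_t *expected, size_t num_expected,
    const pid_t *actual, size_t num_actual)
{
	size_t i = 0;

	for (size_t j = 0; j < num_actual && i < num_expected; j++) {
		if (actual[j] == expected[i]) {
			i++;        /* matched the next expected pid */
		}
	}
	return i == num_expected;   /* all expected pids seen, in order */
}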
5415
5416 /*
5417 * Triggers a sort_order on a specified jetsam priority band.
5418 * This is for testing only, used to force a path through the sort
5419 * function.
5420 */
5421 static int
5422 memorystatus_cmd_test_jetsam_sort(int priority,
5423 int sort_order,
5424 user_addr_t expected_order_user,
5425 size_t expected_order_user_len)
5426 {
5427 int error = 0;
5428 unsigned int bucket_index = 0;
5429 static size_t kMaxPids = 8;
5430 pid_t expected_order[kMaxPids];
5431 size_t copy_size = sizeof(expected_order);
5432 size_t num_pids;
5433
5434 if (expected_order_user_len < copy_size) {
5435 copy_size = expected_order_user_len;
5436 }
5437 num_pids = copy_size / sizeof(pid_t);
5438
5439 error = copyin(expected_order_user, expected_order, copy_size);
5440 if (error != 0) {
5441 return error;
5442 }
5443
5444 if (priority == -1) {
5445 /* Use as shorthand for default priority */
5446 bucket_index = JETSAM_PRIORITY_DEFAULT;
5447 } else {
5448 bucket_index = (unsigned int)priority;
5449 }
5450
5451 /*
5452 * Acquire lock before sorting so we can check the sort order
5453 * while still holding the lock.
5454 */
5455 proc_list_lock();
5456
5457 memorystatus_sort_bucket_locked(bucket_index, sort_order);
5458
5459 if (expected_order_user != CAST_USER_ADDR_T(NULL) && expected_order_user_len > 0) {
5460 error = memorystatus_verify_sort_order(bucket_index, expected_order, num_pids);
5461 }
5462
5463 proc_list_unlock();
5464
5465 return error;
5466 }
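
/*
 * Hedged user-space sketch (DEVELOPMENT || DEBUG kernels): this test is
 * reached through the private memorystatus_control() syscall; the exact
 * argument mapping below is an assumption based on the dispatcher in this
 * file, and the helper name is illustrative.
 */
#include <sys/kern_memorystatus.h>

static int
test_sort_default_band(pid_t *expected, size_t num_expected)
{
	/* priority -1 is shorthand for JETSAM_PRIORITY_DEFAULT (see above) */
	return memorystatus_control(MEMORYSTATUS_CMD_TEST_JETSAM_SORT,
	    -1 /* priority */, JETSAM_SORT_DEFAULT /* sort_order */,
	    expected, num_expected * sizeof(pid_t));
}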
5467
5468 #endif /* DEVELOPMENT || DEBUG */
5469
5470 /*
5471 * Prepare the process to be killed (set state, update snapshot) and kill it.
5472 */
5473 static uint64_t memorystatus_purge_before_jetsam_success = 0;
5474
5475 static boolean_t
5476 memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, boolean_t *killed, uint64_t *footprint_of_killed_proc)
5477 {
5478 pid_t aPid = 0;
5479 uint32_t aPid_ep = 0;
5480
5481 uint64_t killtime = 0;
5482 clock_sec_t tv_sec;
5483 clock_usec_t tv_usec;
5484 uint32_t tv_msec;
5485 boolean_t retval = FALSE;
5486
5487 aPid = p->p_pid;
5488 aPid_ep = p->p_memstat_effectivepriority;
5489
5490 if (cause != kMemorystatusKilledVnodes && cause != kMemorystatusKilledZoneMapExhaustion) {
5491 /*
5492 * Genuine memory pressure and not other (vnode/zone) resource exhaustion.
5493 */
5494 boolean_t success = FALSE;
5495 uint64_t num_pages_purged;
5496 uint64_t num_pages_reclaimed = 0;
5497 uint64_t num_pages_unsecluded = 0;
5498
5499 networking_memstatus_callout(p, cause);
5500 num_pages_purged = vm_purgeable_purge_task_owned(p->task);
5501 num_pages_reclaimed += num_pages_purged;
5502 #if CONFIG_SECLUDED_MEMORY
5503 if (cause == kMemorystatusKilledVMPageShortage &&
5504 vm_page_secluded_count > 0 &&
5505 task_can_use_secluded_mem(p->task, FALSE)) {
5506 /*
5507 * We're about to kill a process that has access
5508 * to the secluded pool. Drain that pool into the
5509 * free or active queues to make these pages re-appear
5510 * as "available", which might make us no longer need
5511 * to kill that process.
5512 * Since the secluded pool does not get refilled while
5513 * a process has access to it, it should remain
5514 * drained.
5515 */
5516 num_pages_unsecluded = vm_page_secluded_drain();
5517 num_pages_reclaimed += num_pages_unsecluded;
5518 }
5519 #endif /* CONFIG_SECLUDED_MEMORY */
5520
5521 if (num_pages_reclaimed) {
5522 /*
5523 * We actually reclaimed something and so let's
5524 * check if we need to continue with the kill.
5525 */
5526 if (cause == kMemorystatusKilledHiwat) {
5527 uint64_t footprint_in_bytes = get_task_phys_footprint(p->task);
5528 uint64_t memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */
5529 success = (footprint_in_bytes <= memlimit_in_bytes);
5530 } else {
5531 success = (memorystatus_avail_pages_below_pressure() == FALSE);
5532 #if CONFIG_SECLUDED_MEMORY
5533 if (!success && num_pages_unsecluded) {
5534 /*
5535 * We just drained the secluded pool
5536 * because we're about to kill a
5537 * process that has access to it.
5538 * This is an important process and
5539 * we'd rather not kill it unless
5540 * absolutely necessary, so declare
5541 * success even if draining the pool
5542 * did not quite get us out of the
5543 * "pressure" level but still got
5544 * us out of the "critical" level.
5545 */
5546 success = (memorystatus_avail_pages_below_critical() == FALSE);
5547 }
5548 #endif /* CONFIG_SECLUDED_MEMORY */
5549 }
5550
5551 if (success) {
5552 memorystatus_purge_before_jetsam_success++;
5553
5554 os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: reclaimed %llu pages (%llu purged, %llu unsecluded) from pid %d [%s] and avoided %s\n",
5555 num_pages_reclaimed, num_pages_purged, num_pages_unsecluded, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_kill_cause_name[cause]);
5556
5557 *killed = FALSE;
5558
5559 return TRUE;
5560 }
5561 }
5562 }
5563
5564 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
5565 MEMORYSTATUS_DEBUG(1, "jetsam: killing pid %d [%s] - %lld Mb > 1 (%d Mb)\n",
5566 aPid, (*p->p_name ? p->p_name : "unknown"),
5567 (footprint_in_bytes / (1024ULL * 1024ULL)), /* converted bytes to MB */
5568 p->p_memstat_memlimit);
5569 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
5570
5571 killtime = mach_absolute_time();
5572 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5573 tv_msec = tv_usec / 1000;
5574
5575 proc_list_lock();
5576 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
5577 proc_list_unlock();
5578
5579 char kill_reason_string[128];
5580
5581 if (cause == kMemorystatusKilledHiwat) {
5582 strlcpy(kill_reason_string, "killing_highwater_process", 128);
5583 } else {
5584 if (aPid_ep == JETSAM_PRIORITY_IDLE) {
5585 strlcpy(kill_reason_string, "killing_idle_process", 128);
5586 } else {
5587 strlcpy(kill_reason_string, "killing_top_process", 128);
5588 }
5589 }
5590
5591 /*
5592 * memorystatus_do_kill drops a reference, so take another one so we can
5593 * continue to use this exit reason even after memorystatus_do_kill()
5594 * returns
5595 */
5596 os_reason_ref(jetsam_reason);
5597
5598 retval = memorystatus_do_kill(p, cause, jetsam_reason, footprint_of_killed_proc);
5599 *killed = retval;
5600
5601 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: %s pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu",
5602 (unsigned long)tv_sec, tv_msec, kill_reason_string,
5603 aPid, ((p && *p->p_name) ? p->p_name : "unknown"),
5604 memorystatus_kill_cause_name[cause], aPid_ep,
5605 (*footprint_of_killed_proc) >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
5606
5607 return retval;
5608 }
5609
5610 /*
5611 * Jetsam the first process in the queue.
5612 */
5613 static boolean_t
5614 memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason,
5615 int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed)
5616 {
5617 pid_t aPid;
5618 proc_t p = PROC_NULL, next_p = PROC_NULL;
5619 boolean_t new_snapshot = FALSE, force_new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE;
5620 unsigned int i = 0;
5621 uint32_t aPid_ep;
5622 int32_t local_max_kill_prio = JETSAM_PRIORITY_IDLE;
5623 uint64_t footprint_of_killed_proc = 0;
5624
5625 #ifndef CONFIG_FREEZE
5626 #pragma unused(any)
5627 #endif
5628
5629 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
5630 MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, 0, 0, 0);
5631
5632
5633 #if CONFIG_JETSAM
5634 if (sort_flag == TRUE) {
5635 (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
5636 }
5637
5638 local_max_kill_prio = max_kill_priority;
5639
5640 force_new_snapshot = FALSE;
5641
5642 #else /* CONFIG_JETSAM */
5643
5644 if (sort_flag == TRUE) {
5645 (void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT);
5646 }
5647
5648 /*
5649 * On macOS, we currently have only 2 reasons to be here:
5650 *
5651 * kMemorystatusKilledZoneMapExhaustion
5652 * AND
5653 * kMemorystatusKilledVMCompressorSpaceShortage
5654 *
5655 * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider
5656 * any and all processes as eligible kill candidates since we need to avoid a panic.
5657 *
5658 * Since this function can be called asynchronously, it is harder to toggle the max_kill_priority
5659 * value before and after a call. So we use this local variable to set the upper bound
5660 * on the eligible kill bands.
5661 */
5662 if (cause == kMemorystatusKilledZoneMapExhaustion) {
5663 local_max_kill_prio = JETSAM_PRIORITY_MAX;
5664 } else {
5665 local_max_kill_prio = max_kill_priority;
5666 }
5667
5668 /*
5669 * And, because we are here under extreme circumstances, we force a snapshot even for
5670 * IDLE kills.
5671 */
5672 force_new_snapshot = TRUE;
5673
5674 #endif /* CONFIG_JETSAM */
5675
5676 if (cause != kMemorystatusKilledZoneMapExhaustion &&
5677 jetsam_current_thread() != NULL &&
5678 jetsam_current_thread()->limit_to_low_bands &&
5679 local_max_kill_prio > JETSAM_PRIORITY_BACKGROUND) {
5680 local_max_kill_prio = JETSAM_PRIORITY_BACKGROUND;
5681 }
5682
5683 proc_list_lock();
5684
5685 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5686 while (next_p && (next_p->p_memstat_effectivepriority <= local_max_kill_prio)) {
5687 p = next_p;
5688 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
5689
5690
5691 aPid = p->p_pid;
5692 aPid_ep = p->p_memstat_effectivepriority;
5693
5694 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
5695 continue; /* with lock held */
5696 }
5697
5698 if (cause == kMemorystatusKilledVnodes) {
5699 /*
5700 * If the system runs out of vnodes, we systematically jetsam
5701 * processes in the hope of freeing enough vnodes to help
5702 * the system recover. The process that happens to trigger
5703 * this path has no known relationship to the vnode shortage.
5704 * Deadlock avoidance: attempt to safeguard the caller.
5705 */
5706
5707 if (p == current_proc()) {
5708 /* do not jetsam the current process */
5709 continue;
5710 }
5711 }
5712
5713 #if CONFIG_FREEZE
5714 boolean_t skip;
5715 boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED);
5716 if (any || reclaim_proc) {
5717 skip = FALSE;
5718 } else {
5719 skip = TRUE;
5720 }
5721
5722 if (skip) {
5723 continue;
5724 } else
5725 #endif
5726 {
5727 if (proc_ref_locked(p) == p) {
5728 /*
5729 * Mark as terminated so that if exit1() indicates success, but the process (for example)
5730 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
5731 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
5732 * acquisition of the proc lock.
5733 */
5734 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
5735 } else {
5736 /*
5737 * We need to restart the search again because
5738 * proc_ref_locked _can_ drop the proc_list lock
5739 * and we could have lost our stored next_p via
5740 * an exit() on another core.
5741 */
5742 i = 0;
5743 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5744 continue;
5745 }
5746
5747 /*
5748 * Capture a snapshot if none exists and:
5749 * - we are forcing a new snapshot creation, either because:
5750 * - on a particular platform we need these snapshots every time, OR
5751 * - a boot-arg/embedded device tree property has been set.
5752 * - priority was not requested (this is something other than an ambient kill)
5753 * - the priority was requested *and* the targeted process is not at idle priority
5754 */
5755 if ((memorystatus_jetsam_snapshot_count == 0) &&
5756 (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) {
5757 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
5758 new_snapshot = TRUE;
5759 }
5760
5761 proc_list_unlock();
5762
5763 freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed, &footprint_of_killed_proc); /* purged and/or killed 'p' */
5764 /* Success? */
5765 if (freed_mem) {
5766 if (killed) {
5767 *memory_reclaimed = footprint_of_killed_proc;
5768 if (priority) {
5769 *priority = aPid_ep;
5770 }
5771 } else {
5772 /* purged */
5773 proc_list_lock();
5774 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5775 proc_list_unlock();
5776 }
5777 proc_rele(p);
5778 goto exit;
5779 }
5780
5781 /*
5782 * Failure - first unwind the state,
5783 * then fall through to restart the search.
5784 */
5785 proc_list_lock();
5786 proc_rele_locked(p);
5787 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5788 p->p_memstat_state |= P_MEMSTAT_ERROR;
5789 *errors += 1;
5790
5791 i = 0;
5792 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5793 }
5794 }
5795
5796 proc_list_unlock();
5797
5798 exit:
5799 os_reason_free(jetsam_reason);
5800
5801 if (!killed) {
5802 *memory_reclaimed = 0;
5803
5804 /* Clear snapshot if freshly captured and no target was found */
5805 if (new_snapshot) {
5806 proc_list_lock();
5807 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
5808 proc_list_unlock();
5809 }
5810 }
5811
5812 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
5813 MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed, 0);
5814
5815 return killed;
5816 }
5817
5818 /*
5819 * Jetsam aggressively
5820 */
5821 static boolean_t
5822 memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count,
5823 int32_t priority_max, uint32_t *errors, uint64_t *memory_reclaimed)
5824 {
5825 pid_t aPid;
5826 proc_t p = PROC_NULL, next_p = PROC_NULL;
5827 boolean_t new_snapshot = FALSE, killed = FALSE;
5828 int kill_count = 0;
5829 unsigned int i = 0;
5830 int32_t aPid_ep = 0;
5831 unsigned int memorystatus_level_snapshot = 0;
5832 uint64_t killtime = 0;
5833 clock_sec_t tv_sec;
5834 clock_usec_t tv_usec;
5835 uint32_t tv_msec;
5836 os_reason_t jetsam_reason = OS_REASON_NULL;
5837 uint64_t footprint_of_killed_proc = 0;
5838
5839 *memory_reclaimed = 0;
5840
5841 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
5842 MEMORYSTATUS_LOG_AVAILABLE_PAGES, priority_max, 0, 0, 0);
5843
5844 if (priority_max >= JETSAM_PRIORITY_FOREGROUND) {
5845 /*
5846 * Check if aggressive jetsam has been asked to kill up to or beyond the
5847 * JETSAM_PRIORITY_FOREGROUND bucket. If yes, sort the FG band based on
5848 * coalition footprint.
5849 */
5850 memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
5851 }
5852
5853 jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause);
5854 if (jetsam_reason == OS_REASON_NULL) {
5855 printf("memorystatus_kill_processes_aggressive: failed to allocate exit reason\n");
5856 }
5857
5858 proc_list_lock();
5859
5860 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5861 while (next_p) {
5862 if (((next_p->p_listflag & P_LIST_EXITED) != 0) ||
5863 ((unsigned int)(next_p->p_memstat_effectivepriority) != i)) {
5864 /*
5865 * We have raced with next_p running on another core.
5866 * It may be exiting or it may have moved to a different
5867 * jetsam priority band. This means we have lost our
5868 * place in line while traversing the jetsam list. We
5869 * attempt to recover by rewinding to the beginning of the band
5870 * we were already traversing. By doing this, we do not guarantee
5871 * that no process escapes this aggressive march, but we can make
5872 * skipping an entire range of processes less likely. (PR-21069019)
5873 */
5874
5875 MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n",
5876 aggr_count, i, (*next_p->p_name ? next_p->p_name : "unknown"), next_p->p_pid);
5877
5878 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5879 continue;
5880 }
5881
5882 p = next_p;
5883 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
5884
5885 if (p->p_memstat_effectivepriority > priority_max) {
5886 /*
5887 * Bail out of this killing spree if we have
5888 * reached beyond the priority_max jetsam band.
5889 * That is, we kill up to and through the
5890 * priority_max jetsam band.
5891 */
5892 proc_list_unlock();
5893 goto exit;
5894 }
5895
5896 aPid = p->p_pid;
5897 aPid_ep = p->p_memstat_effectivepriority;
5898
5899 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
5900 continue;
5901 }
5902
5903 /*
5904 * Capture a snapshot if none exists.
5905 */
5906 if (memorystatus_jetsam_snapshot_count == 0) {
5907 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
5908 new_snapshot = TRUE;
5909 }
5910
5911 /*
5912 * Mark as terminated so that if exit1() indicates success, but the process (for example)
5913 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
5914 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
5915 * acquisition of the proc lock.
5916 */
5917 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
5918
5919 killtime = mach_absolute_time();
5920 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5921 tv_msec = tv_usec / 1000;
5922
5923 /* Shift queue, update stats */
5924 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
5925
5926 /*
5927 * In order to kill the target process, we will drop the proc_list_lock.
5928 * To guarantee that p and next_p don't disappear out from under us,
5929 * we must take a ref on both.
5930 * If we cannot get a reference, then it's likely we've raced with
5931 * that process exiting on another core.
5932 */
5933 if (proc_ref_locked(p) == p) {
5934 if (next_p) {
5935 while (next_p && (proc_ref_locked(next_p) != next_p)) {
5936 proc_t temp_p;
5937
5938 /*
5939 * We must have raced with next_p exiting on another core.
5940 * Recover by getting the next eligible process in the band.
5941 */
5942
5943 MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
5944 aggr_count, next_p->p_pid, (*next_p->p_name ? next_p->p_name : "(unknown)"));
5945
5946 temp_p = next_p;
5947 next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE);
5948 }
5949 }
5950 proc_list_unlock();
5951
5952 printf("%lu.%03d memorystatus: %s%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
5953 (unsigned long)tv_sec, tv_msec,
5954 ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"),
5955 aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"),
5956 memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
5957
5958 memorystatus_level_snapshot = memorystatus_level;
5959
5960 /*
5961 * memorystatus_do_kill() drops a reference, so take another one so we can
5962 * continue to use this exit reason even after memorystatus_do_kill()
5963 * returns.
5964 */
5965 os_reason_ref(jetsam_reason);
5966 killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
5967
5968 /* Success? */
5969 if (killed) {
5970 *memory_reclaimed += footprint_of_killed_proc;
5971 proc_rele(p);
5972 kill_count++;
5973 p = NULL;
5974 killed = FALSE;
5975
5976 /*
5977 * Continue the killing spree.
5978 */
5979 proc_list_lock();
5980 if (next_p) {
5981 proc_rele_locked(next_p);
5982 }
5983
5984 if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) {
5985 if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) {
5986 #if DEVELOPMENT || DEBUG
5987 printf("Disabling Lenient mode after one-time deployment.\n");
5988 #endif /* DEVELOPMENT || DEBUG */
5989 memorystatus_aggressive_jetsam_lenient = FALSE;
5990 break;
5991 }
5992 }
5993
5994 continue;
5995 }
5996
5997 /*
5998 * Failure - first unwind the state,
5999 * then fall through to restart the search.
6000 */
6001 proc_list_lock();
6002 proc_rele_locked(p);
6003 if (next_p) {
6004 proc_rele_locked(next_p);
6005 }
6006 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6007 p->p_memstat_state |= P_MEMSTAT_ERROR;
6008 *errors += 1;
6009 p = NULL;
6010 }
6011
6012 /*
6013 * Failure - restart the search at the beginning of
6014 * the band we were already traversing.
6015 *
6016 * We might have raced with "p" exiting on another core, resulting in no
6017 * ref on "p". Or, we may have failed to kill "p".
6018 *
6019 * Either way, we fall thru to here, leaving the proc in the
6020 * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
6021 *
6022 * And, we hold the proc_list_lock at this point.
6023 */
6024
6025 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6026 }
6027
6028 proc_list_unlock();
6029
6030 exit:
6031 os_reason_free(jetsam_reason);
6032
6033 /* Clear snapshot if freshly captured and no target was found */
6034 if (new_snapshot && (kill_count == 0)) {
6035 proc_list_lock();
6036 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6037 proc_list_unlock();
6038 }
6039
6040 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6041 MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, kill_count, *memory_reclaimed, 0);
6042
6043 if (kill_count > 0) {
6044 return TRUE;
6045 } else {
6046 return FALSE;
6047 }
6048 }
6049
6050 static boolean_t
6051 memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed)
6052 {
6053 pid_t aPid = 0;
6054 proc_t p = PROC_NULL, next_p = PROC_NULL;
6055 boolean_t new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE;
6056 unsigned int i = 0;
6057 uint32_t aPid_ep;
6058 os_reason_t jetsam_reason = OS_REASON_NULL;
6059 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
6060 MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, 0, 0, 0);
6061
6062 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
6063 if (jetsam_reason == OS_REASON_NULL) {
6064 printf("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n");
6065 }
6066
6067 proc_list_lock();
6068
6069 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6070 while (next_p) {
6071 uint64_t footprint_in_bytes = 0;
6072 uint64_t memlimit_in_bytes = 0;
6073 boolean_t skip = FALSE;
6074
6075 p = next_p;
6076 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6077
6078 aPid = p->p_pid;
6079 aPid_ep = p->p_memstat_effectivepriority;
6080
6081 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
6082 continue;
6083 }
6084
6085 /* skip if no limit set */
6086 if (p->p_memstat_memlimit <= 0) {
6087 continue;
6088 }
6089
6090 footprint_in_bytes = get_task_phys_footprint(p->task);
6091 memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */
6092 skip = (footprint_in_bytes <= memlimit_in_bytes);
6093
6094 #if CONFIG_FREEZE
6095 if (!skip) {
6096 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
6097 skip = TRUE;
6098 } else {
6099 skip = FALSE;
6100 }
6101 }
6102 #endif
6103
6104 if (skip) {
6105 continue;
6106 } else {
6107 if (memorystatus_jetsam_snapshot_count == 0) {
6108 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6109 new_snapshot = TRUE;
6110 }
6111
6112 if (proc_ref_locked(p) == p) {
6113 /*
6114 * Mark as terminated so that if exit1() indicates success, but the process (for example)
6115 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
6116 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
6117 * acquisition of the proc lock.
6118 */
6119 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6120
6121 proc_list_unlock();
6122 } else {
6123 /*
6124 * We need to restart the search again because
6125 * proc_ref_locked _can_ drop the proc_list lock
6126 * and we could have lost our stored next_p via
6127 * an exit() on another core.
6128 */
6129 i = 0;
6130 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6131 continue;
6132 }
6133
6134 footprint_in_bytes = 0;
6135 freed_mem = memorystatus_kill_proc(p, kMemorystatusKilledHiwat, jetsam_reason, &killed, &footprint_in_bytes); /* purged and/or killed 'p' */
6136
6137 /* Success? */
6138 if (freed_mem) {
6139 if (killed == FALSE) {
6140 /* purged 'p'... don't reset the HWM candidate count */
6141 *purged = TRUE;
6142
6143 proc_list_lock();
6144 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6145 proc_list_unlock();
6146 } else {
6147 *memory_reclaimed = footprint_in_bytes;
6148 }
6149 proc_rele(p);
6150 goto exit;
6151 }
6152 /*
6153 * Failure - first unwind the state,
6154 * then fall through to restart the search.
6155 */
6156 proc_list_lock();
6157 proc_rele_locked(p);
6158 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6159 p->p_memstat_state |= P_MEMSTAT_ERROR;
6160 *errors += 1;
6161
6162 i = 0;
6163 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6164 }
6165 }
6166
6167 proc_list_unlock();
6168
6169 exit:
6170 os_reason_free(jetsam_reason);
6171
6172 if (!killed) {
6173 *memory_reclaimed = 0;
6174
6175 /* Clear snapshot if freshly captured and no target was found */
6176 if (new_snapshot) {
6177 proc_list_lock();
6178 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6179 proc_list_unlock();
6180 }
6181 }
6182
6183 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
6184 MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed, 0);
6185
6186 return killed;
6187 }
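
/*
 * Worked example of the highwater check above (illustrative numbers): a
 * p_memstat_memlimit of 300 (MB) converts to 300 * 1024 * 1024 =
 * 314572800 bytes; a task footprint of 400 MB (419430400 bytes) exceeds
 * it, so skip is FALSE and the process is a kill candidate.
 */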
6188
6189 /*
6190 * Jetsam a process pinned in the elevated band.
6191 *
6192 * Return: true -- a pinned process was jetsammed
6193 * false -- no pinned process was jetsammed
6194 */
6195 boolean_t
6196 memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors, uint64_t *memory_reclaimed)
6197 {
6198 pid_t aPid = 0;
6199 proc_t p = PROC_NULL, next_p = PROC_NULL;
6200 boolean_t new_snapshot = FALSE, killed = FALSE;
6201 int kill_count = 0;
6202 uint32_t aPid_ep;
6203 uint64_t killtime = 0;
6204 clock_sec_t tv_sec;
6205 clock_usec_t tv_usec;
6206 uint32_t tv_msec;
6207 uint64_t footprint_of_killed_proc = 0;
6208
6209
6210 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
6211 MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, 0, 0, 0);
6212
6213 #if CONFIG_FREEZE
6214 boolean_t consider_frozen_only = FALSE;
6215
6216 if (band == (unsigned int) memorystatus_freeze_jetsam_band) {
6217 consider_frozen_only = TRUE;
6218 }
6219 #endif /* CONFIG_FREEZE */
6220
6221 proc_list_lock();
6222
6223 next_p = memorystatus_get_first_proc_locked(&band, FALSE);
6224 while (next_p) {
6225 p = next_p;
6226 next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
6227
6228 aPid = p->p_pid;
6229 aPid_ep = p->p_memstat_effectivepriority;
6230
6231 /*
6232 * Only pick a process pinned in this elevated band
6233 */
6234 if (!(p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
6235 continue;
6236 }
6237
6238 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
6239 continue;
6240 }
6241
6242 #if CONFIG_FREEZE
6243 if (consider_frozen_only && !(p->p_memstat_state & P_MEMSTAT_FROZEN)) {
6244 continue;
6245 }
6246
6247 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
6248 continue;
6249 }
6250 #endif /* CONFIG_FREEZE */
6251
6252 #if DEVELOPMENT || DEBUG
6253 MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
6254 aggr_count,
6255 aPid, (*p->p_name ? p->p_name : "unknown"),
6256 MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6257 #endif /* DEVELOPMENT || DEBUG */
6258
6259 if (memorystatus_jetsam_snapshot_count == 0) {
6260 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6261 new_snapshot = TRUE;
6262 }
6263
6264 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6265
6266 killtime = mach_absolute_time();
6267 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
6268 tv_msec = tv_usec / 1000;
6269
6270 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
6271
6272 if (proc_ref_locked(p) == p) {
6273 proc_list_unlock();
6274
6275 /*
6276 * memorystatus_do_kill drops a reference, so take another one so we can
6277 * continue to use this exit reason even after memorystatus_do_kill()
6278 * returns
6279 */
6280 os_reason_ref(jetsam_reason);
6281 killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
6282
6283 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n",
6284 (unsigned long)tv_sec, tv_msec,
6285 aggr_count,
6286 aPid, ((p && *p->p_name) ? p->p_name : "unknown"),
6287 memorystatus_kill_cause_name[cause], aPid_ep,
6288 footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6289
6290 /* Success? */
6291 if (killed) {
6292 *memory_reclaimed = footprint_of_killed_proc;
6293 proc_rele(p);
6294 kill_count++;
6295 goto exit;
6296 }
6297
6298 /*
6299 * Failure - first unwind the state,
6300 * then fall through to restart the search.
6301 */
6302 proc_list_lock();
6303 proc_rele_locked(p);
6304 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6305 p->p_memstat_state |= P_MEMSTAT_ERROR;
6306 *errors += 1;
6307 }
6308
6309 /*
6310 * Failure - restart the search.
6311 *
6312 * We might have raced with "p" exiting on another core, resulting in no
6313 * ref on "p". Or, we may have failed to kill "p".
6314 *
6315 * Either way, we fall thru to here, leaving the proc in the
6316 * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state.
6317 *
6318 * And, we hold the proc_list_lock at this point.
6319 */
6320
6321 next_p = memorystatus_get_first_proc_locked(&band, FALSE);
6322 }
6323
6324 proc_list_unlock();
6325
6326 exit:
6327 os_reason_free(jetsam_reason);
6328
6329 if (kill_count == 0) {
6330 *memory_reclaimed = 0;
6331
6332 /* Clear snapshot if freshly captured and no target was found */
6333 if (new_snapshot) {
6334 proc_list_lock();
6335 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6336 proc_list_unlock();
6337 }
6338 }
6339
6340 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6341 MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, kill_count, *memory_reclaimed, 0);
6342
6343 return killed;
6344 }
6345
6346 static boolean_t
6347 memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause)
6348 {
6349 /*
6350 * TODO: allow a general async path
6351 *
6352 * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to
6353 * add the appropriate exit reason code mapping.
6354 */
6355 if ((victim_pid != -1) ||
6356 (cause != kMemorystatusKilledVMPageShortage &&
6357 cause != kMemorystatusKilledVMCompressorThrashing &&
6358 cause != kMemorystatusKilledVMCompressorSpaceShortage &&
6359 cause != kMemorystatusKilledFCThrashing &&
6360 cause != kMemorystatusKilledZoneMapExhaustion)) {
6361 return FALSE;
6362 }
6363
6364 kill_under_pressure_cause = cause;
6365 memorystatus_thread_wake();
6366 return TRUE;
6367 }
6368
6369 boolean_t
6370 memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async)
6371 {
6372 if (async) {
6373 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorSpaceShortage);
6374 } else {
6375 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE);
6376 if (jetsam_reason == OS_REASON_NULL) {
6377 printf("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n");
6378 }
6379
6380 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason);
6381 }
6382 }
6383
6384 #if CONFIG_JETSAM
6385 boolean_t
6386 memorystatus_kill_on_VM_compressor_thrashing(boolean_t async)
6387 {
6388 if (async) {
6389 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorThrashing);
6390 } else {
6391 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING);
6392 if (jetsam_reason == OS_REASON_NULL) {
6393 printf("memorystatus_kill_on_VM_compressor_thrashing -- sync: failed to allocate jetsam reason\n");
6394 }
6395
6396 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorThrashing, jetsam_reason);
6397 }
6398 }
6399
6400 boolean_t
6401 memorystatus_kill_on_VM_page_shortage(boolean_t async)
6402 {
6403 if (async) {
6404 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
6405 } else {
6406 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMPAGESHORTAGE);
6407 if (jetsam_reason == OS_REASON_NULL) {
6408 printf("memorystatus_kill_on_VM_page_shortage -- sync: failed to allocate jetsam reason\n");
6409 }
6410
6411 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage, jetsam_reason);
6412 }
6413 }
6414
6415 boolean_t
6416 memorystatus_kill_on_FC_thrashing(boolean_t async)
6417 {
6418 if (async) {
6419 return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing);
6420 } else {
6421 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_FCTHRASHING);
6422 if (jetsam_reason == OS_REASON_NULL) {
6423 printf("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n");
6424 }
6425
6426 return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing, jetsam_reason);
6427 }
6428 }
6429
6430 boolean_t
6431 memorystatus_kill_on_vnode_limit(void)
6432 {
6433 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE);
6434 if (jetsam_reason == OS_REASON_NULL) {
6435 printf("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n");
6436 }
6437
6438 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason);
6439 }
6440
6441 #endif /* CONFIG_JETSAM */
6442
6443 boolean_t
6444 memorystatus_kill_on_zone_map_exhaustion(pid_t pid)
6445 {
6446 boolean_t res = FALSE;
6447 if (pid == -1) {
6448 res = memorystatus_kill_process_async(-1, kMemorystatusKilledZoneMapExhaustion);
6449 } else {
6450 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION);
6451 if (jetsam_reason == OS_REASON_NULL) {
6452 printf("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n");
6453 }
6454
6455 res = memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason);
6456 }
6457 return res;
6458 }
6459
6460 void
6461 memorystatus_on_pageout_scan_end(void)
6462 {
6463 /* No-op */
6464 }
6465
6466 /* Return both allocated and actual size, since there's a race between allocation and list compilation */
6467 static int
6468 memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
6469 {
6470 uint32_t list_count, i = 0;
6471 memorystatus_priority_entry_t *list_entry;
6472 proc_t p;
6473
6474 list_count = memorystatus_list_count;
6475 *list_size = sizeof(memorystatus_priority_entry_t) * list_count;
6476
6477 /* Just a size check? */
6478 if (size_only) {
6479 return 0;
6480 }
6481
6482 /* Otherwise, validate the size of the buffer */
6483 if (*buffer_size < *list_size) {
6484 return EINVAL;
6485 }
6486
6487 *list_ptr = kheap_alloc(KHEAP_TEMP, *list_size, Z_WAITOK | Z_ZERO);
6488 if (!*list_ptr) {
6489 return ENOMEM;
6490 }
6491
6492 *buffer_size = *list_size;
6493 *list_size = 0;
6494
6495 list_entry = *list_ptr;
6496
6497 proc_list_lock();
6498
6499 p = memorystatus_get_first_proc_locked(&i, TRUE);
6500 while (p && (*list_size < *buffer_size)) {
6501 list_entry->pid = p->p_pid;
6502 list_entry->priority = p->p_memstat_effectivepriority;
6503 list_entry->user_data = p->p_memstat_userdata;
6504
6505 if (p->p_memstat_memlimit <= 0) {
6506 task_get_phys_footprint_limit(p->task, &list_entry->limit);
6507 } else {
6508 list_entry->limit = p->p_memstat_memlimit;
6509 }
6510
6511 list_entry->state = memorystatus_build_state(p);
6512 list_entry++;
6513
6514 *list_size += sizeof(memorystatus_priority_entry_t);
6515
6516 p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6517 }
6518
6519 proc_list_unlock();
6520
6521 MEMORYSTATUS_DEBUG(1, "memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);
6522
6523 return 0;
6524 }
6525
6526 static int
6527 memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size)
6528 {
6529 int error = 0;
6530 memorystatus_priority_entry_t mp_entry;
6531 kern_return_t ret;
6532
6533 /* Validate inputs */
6534 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_entry_t))) {
6535 return EINVAL;
6536 }
6537
6538 proc_t p = proc_find(pid);
6539 if (!p) {
6540 return ESRCH;
6541 }
6542
6543 memset(&mp_entry, 0, sizeof(memorystatus_priority_entry_t));
6544
6545 mp_entry.pid = p->p_pid;
6546 mp_entry.priority = p->p_memstat_effectivepriority;
6547 mp_entry.user_data = p->p_memstat_userdata;
6548 if (p->p_memstat_memlimit <= 0) {
6549 ret = task_get_phys_footprint_limit(p->task, &mp_entry.limit);
6550 if (ret != KERN_SUCCESS) {
6551 proc_rele(p);
6552 return EINVAL;
6553 }
6554 } else {
6555 mp_entry.limit = p->p_memstat_memlimit;
6556 }
6557 mp_entry.state = memorystatus_build_state(p);
6558
6559 proc_rele(p);
6560
6561 error = copyout(&mp_entry, buffer, buffer_size);
6562
6563 return error;
6564 }
6565
6566 static int
6567 memorystatus_cmd_get_priority_list(pid_t pid, user_addr_t buffer, size_t buffer_size, int32_t *retval)
6568 {
6569 int error = 0;
6570 boolean_t size_only;
6571 size_t list_size;
6572
6573 /*
6574 * When a non-zero pid is provided, the 'list' has only one entry.
6575 */
6576
6577 size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);
6578
6579 if (pid != 0) {
6580 list_size = sizeof(memorystatus_priority_entry_t) * 1;
6581 if (!size_only) {
6582 error = memorystatus_get_priority_pid(pid, buffer, buffer_size);
6583 }
6584 } else {
6585 memorystatus_priority_entry_t *list = NULL;
6586 error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);
6587
6588 if (error == 0) {
6589 if (!size_only) {
6590 error = copyout(list, buffer, list_size);
6591 }
6592 }
6593
6594 if (list) {
6595 kheap_free(KHEAP_TEMP, list, buffer_size);
6596 }
6597 }
6598
6599 if (error == 0) {
6600 assert(list_size <= INT32_MAX);
6601 *retval = (int32_t) list_size;
6602 }
6603
6604 return error;
6605 }
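
/*
 * Illustrative user-space sketch of the two-step size-query protocol used by
 * MEMORYSTATUS_CMD_GET_PRIORITY_LIST: a NULL buffer requests the size only,
 * after which the caller allocates and repeats the call. A minimal sketch,
 * not compiled here, assuming the memorystatus_control() prototype exported
 * to user space by <sys/kern_memorystatus.h>; the caller must be root or
 * hold the memorystatus entitlement, as checked in memorystatus_control().
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <sys/kern_memorystatus.h>

static void
dump_priority_list(void)
{
	/* Step 1: size-only probe (NULL buffer => size_only above) */
	int size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, 0, 0, NULL, 0);
	if (size <= 0) {
		return;
	}

	memorystatus_priority_entry_t *list = malloc((size_t)size);
	if (list == NULL) {
		return;
	}

	/* Step 2: fetch the entries; the return value is the size copied out */
	int used = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, 0, 0, list, (size_t)size);
	for (int i = 0; i < used / (int)sizeof(memorystatus_priority_entry_t); i++) {
		printf("pid %d priority %d\n", list[i].pid, list[i].priority);
	}
	free(list);
}
#endif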
6606
6607 static void
6608 memorystatus_clear_errors(void)
6609 {
6610 proc_t p;
6611 unsigned int i = 0;
6612
6613 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START, 0, 0, 0, 0, 0);
6614
6615 proc_list_lock();
6616
6617 p = memorystatus_get_first_proc_locked(&i, TRUE);
6618 while (p) {
6619 if (p->p_memstat_state & P_MEMSTAT_ERROR) {
6620 p->p_memstat_state &= ~P_MEMSTAT_ERROR;
6621 }
6622 p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6623 }
6624
6625 proc_list_unlock();
6626
6627 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END, 0, 0, 0, 0, 0);
6628 }
6629
6630 #if CONFIG_JETSAM
6631 static void
6632 memorystatus_update_levels_locked(boolean_t critical_only)
6633 {
6634 memorystatus_available_pages_critical = memorystatus_available_pages_critical_base;
6635
6636 /*
6637 * If there's an entry in the first bucket, we have idle processes.
6638 */
6639
6640 memstat_bucket_t *first_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
6641 if (first_bucket->count) {
6642 memorystatus_available_pages_critical += memorystatus_available_pages_critical_idle_offset;
6643
6644 if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
6645 /*
6646 * The critical threshold must never exceed the pressure threshold
6647 */
6648 memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
6649 }
6650 }
6651
6652 if (memorystatus_jetsam_policy & kPolicyMoreFree) {
6653 memorystatus_available_pages_critical += memorystatus_policy_more_free_offset_pages;
6654 }
6655
6656 if (critical_only) {
6657 return;
6658 }
6659
6660 #if VM_PRESSURE_EVENTS
6661 memorystatus_available_pages_pressure = (int32_t)(pressure_threshold_percentage * (atop_64(max_mem) / 100));
6662 #endif
6663 }
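
/*
 * Worked example of the pressure-threshold arithmetic above, with
 * hypothetical values: for max_mem = 4GB and 16KB pages, atop_64(max_mem)
 * is 262144 pages, so a pressure_threshold_percentage of 15 yields
 * 15 * (262144 / 100) = 15 * 2621 = 39315 pages for
 * memorystatus_available_pages_pressure (note the integer division
 * happens first).
 */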
6664
6665 void
6666 memorystatus_fast_jetsam_override(boolean_t enable_override)
6667 {
6668 /* If fast jetsam is not enabled, simply return */
6669 if (!fast_jetsam_enabled) {
6670 return;
6671 }
6672
6673 if (enable_override) {
6674 if ((memorystatus_jetsam_policy & kPolicyMoreFree) == kPolicyMoreFree) {
6675 return;
6676 }
6677 proc_list_lock();
6678 memorystatus_jetsam_policy |= kPolicyMoreFree;
6679 memorystatus_thread_pool_max();
6680 memorystatus_update_levels_locked(TRUE);
6681 proc_list_unlock();
6682 } else {
6683 if ((memorystatus_jetsam_policy & kPolicyMoreFree) == 0) {
6684 return;
6685 }
6686 proc_list_lock();
6687 memorystatus_jetsam_policy &= ~kPolicyMoreFree;
6688 memorystatus_thread_pool_default();
6689 memorystatus_update_levels_locked(TRUE);
6690 proc_list_unlock();
6691 }
6692 }
6693
6694
6695 static int
6696 sysctl_kern_memorystatus_policy_more_free SYSCTL_HANDLER_ARGS
6697 {
6698 #pragma unused(arg1, arg2, oidp)
6699 int error = 0, more_free = 0;
6700
6701 /*
6702 * TODO: Enable this privilege check?
6703 *
6704 * error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
6705 * if (error)
6706 * return (error);
6707 */
6708
6709 error = sysctl_handle_int(oidp, &more_free, 0, req);
6710 if (error || !req->newptr) {
6711 return error;
6712 }
6713
6714 if (more_free) {
6715 memorystatus_fast_jetsam_override(true);
6716 } else {
6717 memorystatus_fast_jetsam_override(false);
6718 }
6719
6720 return 0;
6721 }
6722 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_policy_more_free, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
6723 0, 0, &sysctl_kern_memorystatus_policy_more_free, "I", "");
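
/*
 * Illustrative usage of the sysctl above from user space (each write
 * funnels into memorystatus_fast_jetsam_override()):
 *
 *     sysctl kern.memorystatus_policy_more_free=1    # raise the critical threshold
 *     sysctl kern.memorystatus_policy_more_free=0    # restore the default
 */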
6724
6725 #endif /* CONFIG_JETSAM */
6726
6727 /*
6728 * Get the at_boot snapshot
6729 */
6730 static int
6731 memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
6732 {
6733 size_t input_size = *snapshot_size;
6734
6735 /*
6736 * The at_boot snapshot has no entry list.
6737 */
6738 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t);
6739
6740 if (size_only) {
6741 return 0;
6742 }
6743
6744 /*
6745 * Validate the size of the snapshot buffer
6746 */
6747 if (input_size < *snapshot_size) {
6748 return EINVAL;
6749 }
6750
6751 /*
6752 * Update the notification_time only
6753 */
6754 memorystatus_at_boot_snapshot.notification_time = mach_absolute_time();
6755 *snapshot = &memorystatus_at_boot_snapshot;
6756
6757 MEMORYSTATUS_DEBUG(7, "memorystatus_get_at_boot_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%d)\n",
6758 (long)input_size, (long)*snapshot_size, 0);
6759 return 0;
6760 }
6761
6762 /*
6763 * Get the previous fully populated snapshot
6764 */
6765 static int
6766 memorystatus_get_jetsam_snapshot_copy(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
6767 {
6768 size_t input_size = *snapshot_size;
6769
6770 if (memorystatus_jetsam_snapshot_copy_count > 0) {
6771 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_copy_count));
6772 } else {
6773 *snapshot_size = 0;
6774 }
6775
6776 if (size_only) {
6777 return 0;
6778 }
6779
6780 if (input_size < *snapshot_size) {
6781 return EINVAL;
6782 }
6783
6784 *snapshot = memorystatus_jetsam_snapshot_copy;
6785
6786 MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot_copy: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
6787 (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_copy_count);
6788
6789 return 0;
6790 }
6791
6792 #if CONFIG_FREEZE
6793 static int
6794 memorystatus_get_jetsam_snapshot_freezer(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
6795 {
6796 size_t input_size = *snapshot_size;
6797
6798 if (memorystatus_jetsam_snapshot_freezer->entry_count > 0) {
6799 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_freezer->entry_count));
6800 } else {
6801 *snapshot_size = 0;
6802 }
6803 assert(*snapshot_size <= memorystatus_jetsam_snapshot_freezer_size);
6804
6805 if (size_only) {
6806 return 0;
6807 }
6808
6809 if (input_size < *snapshot_size) {
6810 return EINVAL;
6811 }
6812
6813 *snapshot = memorystatus_jetsam_snapshot_freezer;
6814
6815 MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot_freezer: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
6816 (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_freezer->entry_count);
6817
6818 return 0;
6819 }
6820 #endif /* CONFIG_FREEZE */
6821
6822 static int
6823 memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
6824 {
6825 size_t input_size = *snapshot_size;
6826 uint32_t ods_list_count = memorystatus_list_count;
6827 memorystatus_jetsam_snapshot_t *ods = NULL; /* The on_demand snapshot buffer */
6828
6829 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (ods_list_count));
6830
6831 if (size_only) {
6832 return 0;
6833 }
6834
6835 /*
6836 * Validate the size of the snapshot buffer.
6837 * This is inherently racy. May want to revisit
6838 * this error condition and trim the output when
6839 * it doesn't fit.
6840 */
6841 if (input_size < *snapshot_size) {
6842 return EINVAL;
6843 }
6844
6845 /*
6846 * Allocate and initialize a snapshot buffer.
6847 */
6848 ods = kalloc(*snapshot_size);
6849 if (!ods) {
6850 return ENOMEM;
6851 }
6852
6853 memset(ods, 0, *snapshot_size);
6854
6855 proc_list_lock();
6856 memorystatus_init_jetsam_snapshot_locked(ods, ods_list_count);
6857 proc_list_unlock();
6858
6859 /*
6860 * Return the kernel allocated, on_demand buffer.
6861 * The caller of this routine will copy the data out
6862 * to user space and then free the kernel allocated
6863 * buffer.
6864 */
6865 *snapshot = ods;
6866
6867 MEMORYSTATUS_DEBUG(7, "memorystatus_get_on_demand_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
6868 (long)input_size, (long)*snapshot_size, (long)ods_list_count);
6869
6870 return 0;
6871 }
6872
6873 static int
6874 memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
6875 {
6876 size_t input_size = *snapshot_size;
6877
6878 if (memorystatus_jetsam_snapshot_count > 0) {
6879 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
6880 } else {
6881 *snapshot_size = 0;
6882 }
6883
6884 if (size_only) {
6885 return 0;
6886 }
6887
6888 if (input_size < *snapshot_size) {
6889 return EINVAL;
6890 }
6891
6892 *snapshot = memorystatus_jetsam_snapshot;
6893
6894 MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
6895 (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_count);
6896
6897 return 0;
6898 }
6899
6900
6901 static int
6902 memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval)
6903 {
6904 int error = EINVAL;
6905 boolean_t size_only;
6906 boolean_t is_default_snapshot = FALSE;
6907 boolean_t is_on_demand_snapshot = FALSE;
6908 boolean_t is_at_boot_snapshot = FALSE;
6909 #if CONFIG_FREEZE
6910 bool is_freezer_snapshot = false;
6911 #endif /* CONFIG_FREEZE */
6912 memorystatus_jetsam_snapshot_t *snapshot;
6913
6914 size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);
6915
6916 if (flags == 0) {
6917 /* Default */
6918 is_default_snapshot = TRUE;
6919 error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
6920 } else {
6921 if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT | MEMORYSTATUS_SNAPSHOT_COPY | MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER)) {
6922 /*
6923 * Unsupported bit set in flag.
6924 */
6925 return EINVAL;
6926 }
6927
6928 if (flags & (flags - 0x1)) {
6929 /*
6930 * Can't have multiple flags set at the same time.
6931 */
6932 return EINVAL;
6933 }
6934
6935 if (flags & MEMORYSTATUS_SNAPSHOT_ON_DEMAND) {
6936 is_on_demand_snapshot = TRUE;
6937 /*
6938 * When not requesting the size only, the following call will allocate
6939 * an on_demand snapshot buffer, which is freed below.
6940 */
6941 error = memorystatus_get_on_demand_snapshot(&snapshot, &buffer_size, size_only);
6942 } else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
6943 is_at_boot_snapshot = TRUE;
6944 error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
6945 } else if (flags & MEMORYSTATUS_SNAPSHOT_COPY) {
6946 error = memorystatus_get_jetsam_snapshot_copy(&snapshot, &buffer_size, size_only);
6947 #if CONFIG_FREEZE
6948 } else if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER) {
6949 is_freezer_snapshot = true;
6950 error = memorystatus_get_jetsam_snapshot_freezer(&snapshot, &buffer_size, size_only);
6951 #endif /* CONFIG_FREEZE */
6952 } else {
6953 /*
6954 * Invalid flag setting.
6955 */
6956 return EINVAL;
6957 }
6958 }
6959
6960 if (error) {
6961 goto out;
6962 }
6963
6964 /*
6965 * Copy the data out to user space and clear the snapshot buffer.
6966 * If working with the jetsam snapshot,
6967 * clearing the buffer means resetting the count.
6968 * If working with an on_demand snapshot,
6969 * clearing the buffer means freeing it.
6970 * If working with the at_boot snapshot,
6971 * there is nothing to clear or update.
6972 * If working with a copy of the snapshot,
6973 * there is nothing to clear or update.
6974 * If working with the freezer snapshot,
6975 * clearing the buffer means resetting the count.
6976 */
6977 if (!size_only) {
6978 if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
6979 #if CONFIG_FREEZE
6980 if (is_default_snapshot || is_freezer_snapshot) {
6981 #else
6982 if (is_default_snapshot) {
6983 #endif /* CONFIG_FREEZE */
6984 /*
6985 * The jetsam snapshot is never freed, its count is simply reset.
6986 * However, we make a copy for any parties that might be interested
6987 * in the previous fully populated snapshot.
6988 */
6989 proc_list_lock();
6990 #if DEVELOPMENT || DEBUG
6991 if (memorystatus_snapshot_owner != 0 && memorystatus_snapshot_owner != current_proc()->p_pid) {
6992 /* Snapshot is currently owned by someone else. Don't consume it. */
6993 proc_list_unlock();
6994 goto out;
6995 }
6996 #endif /* (DEVELOPMENT || DEBUG)*/
6997 if (is_default_snapshot) {
6998 memcpy(memorystatus_jetsam_snapshot_copy, memorystatus_jetsam_snapshot, memorystatus_jetsam_snapshot_size);
6999 memorystatus_jetsam_snapshot_copy_count = memorystatus_jetsam_snapshot_count;
7000 snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
7001 memorystatus_jetsam_snapshot_last_timestamp = 0;
7002 }
7003 #if CONFIG_FREEZE
7004 else if (is_freezer_snapshot) {
7005 memorystatus_jetsam_snapshot_freezer->entry_count = 0;
7006 }
7007 #endif /* CONFIG_FREEZE */
7008 proc_list_unlock();
7009 }
7010 }
7011
7012 if (is_on_demand_snapshot) {
7013 /*
7014 * The on_demand snapshot is always freed,
7015 * even if the copyout failed.
7016 */
7017 if (snapshot) {
7018 kfree(snapshot, buffer_size);
7019 }
7020 }
7021 }
7022
7023 out:
7024 if (error == 0) {
7025 assert(buffer_size <= INT32_MAX);
7026 *retval = (int32_t) buffer_size;
7027 }
7028 return error;
7029 }
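
/*
 * Illustrative user-space sketch of pulling an on_demand snapshot through
 * the command above, using the same NULL-buffer size probe. A minimal
 * sketch, not compiled here, assuming the user-space memorystatus_control()
 * prototype from <sys/kern_memorystatus.h>; since the process list can grow
 * between the two calls, the kernel may return EINVAL and the caller should
 * retry.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <sys/kern_memorystatus.h>

static void
dump_on_demand_snapshot(void)
{
	int size = memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0,
	    MEMORYSTATUS_SNAPSHOT_ON_DEMAND, NULL, 0);
	if (size <= 0) {
		return;
	}

	memorystatus_jetsam_snapshot_t *snapshot = malloc((size_t)size);
	if (snapshot == NULL) {
		return;
	}

	if (memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0,
	    MEMORYSTATUS_SNAPSHOT_ON_DEMAND, snapshot, (size_t)size) > 0) {
		printf("snapshot holds %llu entries\n",
		    (unsigned long long)snapshot->entry_count);
	}
	free(snapshot);
}
#endif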
7030
7031 #if DEVELOPMENT || DEBUG
7032 static int
7033 memorystatus_cmd_set_jetsam_snapshot_ownership(int32_t flags)
7034 {
7035 int error = EINVAL;
7036 proc_t caller = current_proc();
7037 assert(caller != kernproc);
7038 proc_list_lock();
7039 if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_TAKE_OWNERSHIP) {
7040 if (memorystatus_snapshot_owner == 0) {
7041 memorystatus_snapshot_owner = caller->p_pid;
7042 error = 0;
7043 } else if (memorystatus_snapshot_owner == caller->p_pid) {
7044 error = 0;
7045 } else {
7046 /* We don't allow ownership to be taken from another proc. */
7047 error = EBUSY;
7048 }
7049 } else if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_DROP_OWNERSHIP) {
7050 if (memorystatus_snapshot_owner == caller->p_pid) {
7051 memorystatus_snapshot_owner = 0;
7052 error = 0;
7053 } else if (memorystatus_snapshot_owner != 0) {
7054 /* We don't allow ownership to be taken from another proc. */
7055 error = EPERM;
7056 }
7057 }
7058 proc_list_unlock();
7059
7060 return error;
7061 }
7062 #endif /* DEVELOPMENT || DEBUG */
7063
7064 /*
7065 * Routine: memorystatus_cmd_grp_set_priorities
7066 * Purpose: Update priorities for a group of processes.
7067 *
7068 * [priority]
7069 * Move each process out of its effective priority
7070 * band and into a new priority band.
7071 * Maintains relative order from lowest to highest priority.
7072 * Within a single band, maintains relative order from head to tail.
7073 *
7074 * eg: before [effectivepriority | pid]
7075 * [18 | p101 ]
7076 * [17 | p55, p67, p19 ]
7077 * [12 | p103 p10 ]
7078 * [ 7 | p25 ]
7079 * [ 0 | p71, p82, ]
7080 *
7081 * after [ new band | pid]
7082 * [ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
7083 *
7084 * Returns: 0 on success, else non-zero.
7085 *
7086 * Caveat: We know there is a race window regarding recycled pids.
7087 * A process could be killed before the kernel can act on it here.
7088 * If a pid cannot be found in any of the jetsam priority bands,
7089 * then we simply ignore it. No harm.
7090 * But, if the pid has been recycled then it could be an issue.
7091 * In that scenario, we might move an unsuspecting process to the new
7092 * priority band. It's not clear how the kernel can safeguard
7093 * against this, but it would be an extremely rare case anyway.
7094 * The caller of this api might avoid such race conditions by
7095 * ensuring that the processes passed in the pid list are suspended.
7096 */
7097
7098
7099 static int
7100 memorystatus_cmd_grp_set_priorities(user_addr_t buffer, size_t buffer_size)
7101 {
7102 /*
7103 * We only handle setting priority
7104 * per process
7105 */
7106
7107 int error = 0;
7108 memorystatus_properties_entry_v1_t *entries = NULL;
7109 size_t entry_count = 0;
7110
7111 /* This will be the ordered proc list */
7112 typedef struct memorystatus_internal_properties {
7113 proc_t proc;
7114 int32_t priority;
7115 } memorystatus_internal_properties_t;
7116
7117 memorystatus_internal_properties_t *table = NULL;
7118 size_t table_size = 0;
7119 uint32_t table_count = 0;
7120
7121 size_t i = 0;
7122 uint32_t bucket_index = 0;
7123 boolean_t head_insert;
7124 int32_t new_priority;
7125
7126 proc_t p;
7127
7128 /* Verify inputs */
7129 if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
7130 error = EINVAL;
7131 goto out;
7132 }
7133
7134 entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
7135 if (entry_count == 0) {
7136 /* buffer size was not large enough for a single entry */
7137 error = EINVAL;
7138 goto out;
7139 }
7140
7141 if ((entries = kheap_alloc(KHEAP_TEMP, buffer_size, Z_WAITOK)) == NULL) {
7142 error = ENOMEM;
7143 goto out;
7144 }
7145
7146 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, 0, 0, 0);
7147
7148 if ((error = copyin(buffer, entries, buffer_size)) != 0) {
7149 goto out;
7150 }
7151
7152 /* Verify sanity of input priorities */
7153 if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
7154 if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
7155 error = EINVAL;
7156 goto out;
7157 }
7158 } else {
7159 error = EINVAL;
7160 goto out;
7161 }
7162
7163 for (i = 0; i < entry_count; i++) {
7164 if (entries[i].priority == -1) {
7165 /* Use as shorthand for default priority */
7166 entries[i].priority = JETSAM_PRIORITY_DEFAULT;
7167 } else if ((entries[i].priority == system_procs_aging_band) || (entries[i].priority == applications_aging_band)) {
7168 /* Both the aging bands are reserved for internal use;
7169 * if requested, adjust to JETSAM_PRIORITY_IDLE. */
7170 entries[i].priority = JETSAM_PRIORITY_IDLE;
7171 } else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
7172 /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
7173 * queue */
7174 /* Deal with this later */
7175 } else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
7176 /* Sanity check */
7177 error = EINVAL;
7178 goto out;
7179 }
7180 }
7181
7182 table_size = sizeof(memorystatus_internal_properties_t) * entry_count;
7183 if ((table = kheap_alloc(KHEAP_TEMP, table_size, Z_WAITOK | Z_ZERO)) == NULL) {
7184 error = ENOMEM;
7185 goto out;
7186 }
7187
7188
7189 /*
7190 * For each jetsam bucket entry, spin through the input property list.
7191 * When a matching pid is found, populate an adjacent table with the
7192 * appropriate proc pointer and new property values.
7193 * This traversal automatically preserves order from lowest
7194 * to highest priority.
7195 */
7196
7197 bucket_index = 0;
7198
7199 proc_list_lock();
7200
7201 /* Create the ordered table */
7202 p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
7203 while (p && (table_count < entry_count)) {
7204 for (i = 0; i < entry_count; i++) {
7205 if (p->p_pid == entries[i].pid) {
7206 /* Build the table data */
7207 table[table_count].proc = p;
7208 table[table_count].priority = entries[i].priority;
7209 table_count++;
7210 break;
7211 }
7212 }
7213 p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
7214 }
7215
7216 /* We now have ordered list of procs ready to move */
7217 for (i = 0; i < table_count; i++) {
7218 p = table[i].proc;
7219 assert(p != NULL);
7220
7221 /* Allow head inserts -- but relative order is now lost */
7222 if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
7223 new_priority = JETSAM_PRIORITY_IDLE;
7224 head_insert = true;
7225 } else {
7226 new_priority = table[i].priority;
7227 head_insert = false;
7228 }
7229
7230 /* Not allowed */
7231 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
7232 continue;
7233 }
7234
7235 /*
7236 * Take appropriate steps if moving proc out of
7237 * either of the aging bands.
7238 */
7239 if ((p->p_memstat_effectivepriority == system_procs_aging_band) || (p->p_memstat_effectivepriority == applications_aging_band)) {
7240 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
7241 }
7242
7243 memorystatus_update_priority_locked(p, new_priority, head_insert, false);
7244 }
7245
7246 proc_list_unlock();
7247
7248 /*
7249 * if (table_count != entry_count)
7250 * then some pids were not found in a jetsam band.
7251 * harmless but interesting...
7252 */
7253 out:
7254 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, table_count, 0, 0);
7255
7256 if (entries) {
7257 kheap_free(KHEAP_TEMP, entries, buffer_size);
7258 }
7259 if (table) {
7260 kheap_free(KHEAP_TEMP, table, table_size);
7261 }
7262
7263 return error;
7264 }
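
/*
 * Illustrative sketch of driving the routine above from user space via
 * MEMORYSTATUS_CMD_GRP_SET_PROPERTIES: all listed pids land in one band,
 * preserving their relative order as documented. A minimal sketch with
 * hypothetical pids, not compiled here; assumes the user-space
 * memorystatus_control() prototype from <sys/kern_memorystatus.h>.
 */
#if 0
#include <string.h>
#include <sys/kern_memorystatus.h>

static int
move_pids_to_band(const pid_t *pids, size_t count, int32_t band)
{
	memorystatus_properties_entry_v1_t entries[count]; /* count > 0 assumed */

	memset(entries, 0, sizeof(entries));
	for (size_t i = 0; i < count; i++) {
		entries[i].version = MEMORYSTATUS_MPE_VERSION_1;
		entries[i].pid = pids[i];
		entries[i].priority = band; /* -1 is shorthand for JETSAM_PRIORITY_DEFAULT */
	}

	return memorystatus_control(MEMORYSTATUS_CMD_GRP_SET_PROPERTIES, 0,
	    MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entries, sizeof(entries));
}
#endif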
7265
7266 memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL;
7267 size_t memorystatus_global_probabilities_size = 0;
7268
7269 static int
7270 memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size)
7271 {
7272 int error = 0;
7273 memorystatus_properties_entry_v1_t *entries = NULL;
7274 size_t entry_count = 0, i = 0;
7275 memorystatus_internal_probabilities_t *tmp_table_new = NULL, *tmp_table_old = NULL;
7276 size_t tmp_table_new_size = 0, tmp_table_old_size = 0;
7277
7278 /* Verify inputs */
7279 if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
7280 error = EINVAL;
7281 goto out;
7282 }
7283
7284 entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
7285
7286 if ((entries = kheap_alloc(KHEAP_TEMP, buffer_size, Z_WAITOK)) == NULL) {
7287 error = ENOMEM;
7288 goto out;
7289 }
7290
7291 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, 0, 0, 0);
7292
7293 if ((error = copyin(buffer, entries, buffer_size)) != 0) {
7294 goto out;
7295 }
7296
7297 if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
7298 if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
7299 error = EINVAL;
7300 goto out;
7301 }
7302 } else {
7303 error = EINVAL;
7304 goto out;
7305 }
7306
7307 /* Verify sanity of input probabilities */
7308 for (i = 0; i < entry_count; i++) {
7309 /*
7310 * 0 - low probability of use.
7311 * 1 - high probability of use.
7312 *
7313 * Keeping this field an int (& not a bool) to allow
7314 * us to experiment with different values/approaches
7315 * later on.
7316 */
7317 if (entries[i].use_probability > 1) {
7318 error = EINVAL;
7319 goto out;
7320 }
7321 }
7322
7323 tmp_table_new_size = sizeof(memorystatus_internal_probabilities_t) * entry_count;
7324
7325 if ((tmp_table_new = kalloc_flags(tmp_table_new_size, Z_WAITOK | Z_ZERO)) == NULL) {
7326 error = ENOMEM;
7327 goto out;
7328 }
7329
7330 proc_list_lock();
7331
7332 if (memorystatus_global_probabilities_table) {
7333 tmp_table_old = memorystatus_global_probabilities_table;
7334 tmp_table_old_size = memorystatus_global_probabilities_size;
7335 }
7336
7337 memorystatus_global_probabilities_table = tmp_table_new;
7338 memorystatus_global_probabilities_size = tmp_table_new_size;
7339 tmp_table_new = NULL;
7340
7341 for (i = 0; i < entry_count; i++) {
7342 /* Build the table data */
7343 strlcpy(memorystatus_global_probabilities_table[i].proc_name, entries[i].proc_name, MAXCOMLEN + 1);
7344 memorystatus_global_probabilities_table[i].use_probability = entries[i].use_probability;
7345 }
7346
7347 proc_list_unlock();
7348
7349 out:
7350 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, tmp_table_new_size, 0, 0);
7351
7352 if (entries) {
7353 kheap_free(KHEAP_TEMP, entries, buffer_size);
7354 entries = NULL;
7355 }
7356
7357 if (tmp_table_old) {
7358 kfree(tmp_table_old, tmp_table_old_size);
7359 tmp_table_old = NULL;
7360 }
7361
7362 return error;
7363 }
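
/*
 * Companion sketch for the probability path above: same v1 entry layout,
 * but the kernel consumes proc_name and use_probability (0 = low, 1 = high
 * probability of use). Hypothetical process name; not compiled here.
 */
#if 0
#include <string.h>
#include <sys/kern_memorystatus.h>

static int
mark_proc_use_probability(const char *name, uint32_t probability)
{
	memorystatus_properties_entry_v1_t entry;

	memset(&entry, 0, sizeof(entry));
	entry.version = MEMORYSTATUS_MPE_VERSION_1;
	strlcpy(entry.proc_name, name, sizeof(entry.proc_name));
	entry.use_probability = probability;

	return memorystatus_control(MEMORYSTATUS_CMD_GRP_SET_PROPERTIES, 0,
	    MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, &entry, sizeof(entry));
}
#endif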
7364
7365 static int
7366 memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7367 {
7368 int error = 0;
7369
7370 if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) {
7371 error = memorystatus_cmd_grp_set_priorities(buffer, buffer_size);
7372 } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) == MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) {
7373 error = memorystatus_cmd_grp_set_probabilities(buffer, buffer_size);
7374 } else {
7375 error = EINVAL;
7376 }
7377
7378 return error;
7379 }
7380
7381 /*
7382 * This routine is used to update a process's jetsam priority position and stored user_data.
7383 * It is not used for the setting of memory limits, which is why the last 6 args to the
7384 * memorystatus_update() call are 0 or FALSE.
7385 *
7386 * Flags passed into this call are used to distinguish the motivation behind a jetsam priority
7387 * transition. By default, the kernel updates the process's original requested priority when
7388 * no flag is passed. But when the MEMORYSTATUS_SET_PRIORITY_ASSERTION flag is used, the kernel
7389 * updates the process's assertion driven priority.
7390 *
7391 * The assertion flag was introduced for use by the device's assertion mediator (eg: runningboardd).
7392 * When an assertion is controlling a process's jetsam priority, it may conflict with that process's
7393 * dirty/clean (active/inactive) jetsam state. The kernel attempts to resolve a priority transition
7394 * conflict by reviewing the process state and then choosing the maximum jetsam band at play,
7395 * eg: requested priority versus assertion priority.
7396 */
7397
7398 static int
7399 memorystatus_cmd_set_priority_properties(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7400 {
7401 int error = 0;
7402 boolean_t is_assertion = FALSE; /* priority is driven by an assertion */
7403 memorystatus_priority_properties_t mpp_entry;
7404
7405 /* Validate inputs */
7406 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_properties_t))) {
7407 return EINVAL;
7408 }
7409
7410 /* Validate flags */
7411 if (flags == 0) {
7412 /*
7413 * Default. This path updates requestedpriority.
7414 */
7415 } else {
7416 if (flags & ~(MEMORYSTATUS_SET_PRIORITY_ASSERTION)) {
7417 /*
7418 * Unsupported bit set in flag.
7419 */
7420 return EINVAL;
7421 } else if (flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) {
7422 is_assertion = TRUE;
7423 }
7424 }
7425
7426 error = copyin(buffer, &mpp_entry, buffer_size);
7427
7428 if (error == 0) {
7429 proc_t p;
7430
7431 p = proc_find(pid);
7432 if (!p) {
7433 return ESRCH;
7434 }
7435
7436 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
7437 proc_rele(p);
7438 return EPERM;
7439 }
7440
7441 if (is_assertion) {
7442 os_log(OS_LOG_DEFAULT, "memorystatus: set assertion priority(%d) target %s:%d\n",
7443 mpp_entry.priority, (*p->p_name ? p->p_name : "unknown"), p->p_pid);
7444 }
7445
7446 error = memorystatus_update(p, mpp_entry.priority, mpp_entry.user_data, is_assertion, FALSE, FALSE, 0, 0, FALSE, FALSE);
7447 proc_rele(p);
7448 }
7449
7450 return error;
7451 }
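
/*
 * Illustrative sketch of the assertion-driven path documented above: an
 * assertion mediator (runningboardd-style) passes
 * MEMORYSTATUS_SET_PRIORITY_ASSERTION so the kernel updates the assertion
 * priority instead of the requested priority. Hypothetical pid; not
 * compiled here.
 */
#if 0
#include <sys/kern_memorystatus.h>

static int
set_assertion_priority(pid_t pid, int32_t priority)
{
	memorystatus_priority_properties_t props = {
		.priority = priority,
		.user_data = 0,
	};

	return memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, pid,
	    MEMORYSTATUS_SET_PRIORITY_ASSERTION, &props, sizeof(props));
}
#endif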
7452
7453 static int
7454 memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7455 {
7456 int error = 0;
7457 memorystatus_memlimit_properties_t mmp_entry;
7458
7459 /* Validate inputs */
7460 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
7461 return EINVAL;
7462 }
7463
7464 error = copyin(buffer, &mmp_entry, buffer_size);
7465
7466 if (error == 0) {
7467 error = memorystatus_set_memlimit_properties(pid, &mmp_entry);
7468 }
7469
7470 return error;
7471 }
7472
7473 static void
7474 memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t* p_entry)
7475 {
7476 memset(p_entry, 0, sizeof(memorystatus_memlimit_properties_t));
7477
7478 if (p->p_memstat_memlimit_active > 0) {
7479 p_entry->memlimit_active = p->p_memstat_memlimit_active;
7480 } else {
7481 task_convert_phys_footprint_limit(-1, &p_entry->memlimit_active);
7482 }
7483
7484 if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {
7485 p_entry->memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7486 }
7487
7488 /*
7489 * Get the inactive limit and attributes
7490 */
7491 if (p->p_memstat_memlimit_inactive <= 0) {
7492 task_convert_phys_footprint_limit(-1, &p_entry->memlimit_inactive);
7493 } else {
7494 p_entry->memlimit_inactive = p->p_memstat_memlimit_inactive;
7495 }
7496 if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {
7497 p_entry->memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7498 }
7499 }
7500
7501 /*
7502 * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
7503 * That gets the proc's cached memlimit and there is no guarantee that the active/inactive
7504 * limits will be the same in the no-limit case. Instead we convert limits <= 0 using
7505 * task_convert_phys_footprint_limit(). It computes the same limit value that would be written
7506 * to the task's ledgers via task_set_phys_footprint_limit().
7507 */
7508 static int
7509 memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7510 {
7511 memorystatus_memlimit_properties2_t mmp_entry;
7512
7513 /* Validate inputs */
7514 if ((pid == 0) || (buffer == USER_ADDR_NULL) ||
7515 ((buffer_size != sizeof(memorystatus_memlimit_properties_t)) &&
7516 (buffer_size != sizeof(memorystatus_memlimit_properties2_t)))) {
7517 return EINVAL;
7518 }
7519
7520 memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties2_t));
7521
7522 proc_t p = proc_find(pid);
7523 if (!p) {
7524 return ESRCH;
7525 }
7526
7527 /*
7528 * Get the active limit and attributes.
7529 * No locks taken since we hold a reference to the proc.
7530 */
7531
7532 memorystatus_get_memlimit_properties_internal(p, &mmp_entry.v1);
7533
7534 #if CONFIG_JETSAM
7535 #if DEVELOPMENT || DEBUG
7536 /*
7537 * Get the limit increased via SPI
7538 */
7539 mmp_entry.memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
7540 mmp_entry.memlimit_increase_bytes = p->p_memlimit_increase;
7541 #endif /* DEVELOPMENT || DEBUG */
7542 #endif /* CONFIG_JETSAM */
7543
7544 proc_rele(p);
7545
7546 int error = copyout(&mmp_entry, buffer, buffer_size);
7547
7548 return error;
7549 }
7550
7551
7552 /*
7553 * SPI for kbd - pr24956468
7554 * This is a very simple snapshot that calculates how much a
7555 * process's phys_footprint exceeds a specific memory limit.
7556 * Only the inactive memory limit is supported for now.
7557 * The delta is returned as bytes in excess or zero.
7558 */
7559 static int
7560 memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7561 {
7562 int error = 0;
7563 uint64_t footprint_in_bytes = 0;
7564 uint64_t delta_in_bytes = 0;
7565 int32_t memlimit_mb = 0;
7566 uint64_t memlimit_bytes = 0;
7567
7568 /* Validate inputs */
7569 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(uint64_t)) || (flags != 0)) {
7570 return EINVAL;
7571 }
7572
7573 proc_t p = proc_find(pid);
7574 if (!p) {
7575 return ESRCH;
7576 }
7577
7578 /*
7579 * Get the inactive limit.
7580 * No locks taken since we hold a reference to the proc.
7581 */
7582
7583 if (p->p_memstat_memlimit_inactive <= 0) {
7584 task_convert_phys_footprint_limit(-1, &memlimit_mb);
7585 } else {
7586 memlimit_mb = p->p_memstat_memlimit_inactive;
7587 }
7588
7589 footprint_in_bytes = get_task_phys_footprint(p->task);
7590
7591 proc_rele(p);
7592
7593 memlimit_bytes = (uint64_t)memlimit_mb * 1024 * 1024; /* MB to bytes; widen first to avoid 32-bit overflow */
7594
7595 /*
7596 * Computed delta always returns >= 0 bytes
7597 */
7598 if (footprint_in_bytes > memlimit_bytes) {
7599 delta_in_bytes = footprint_in_bytes - memlimit_bytes;
7600 }
7601
7602 error = copyout(&delta_in_bytes, buffer, sizeof(delta_in_bytes));
7603
7604 return error;
7605 }
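
/*
 * Illustrative sketch of calling the SPI above: the kernel copies out a
 * single uint64_t holding the bytes by which the process's phys_footprint
 * exceeds its inactive limit, or zero. Not compiled here.
 */
#if 0
#include <stdint.h>
#include <sys/kern_memorystatus.h>

static uint64_t
memlimit_excess_bytes(pid_t pid)
{
	uint64_t excess = 0;

	(void)memorystatus_control(MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS, pid, 0,
	    &excess, sizeof(excess));
	return excess;
}
#endif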
7606
7607
7608 static int
7609 memorystatus_cmd_get_pressure_status(int32_t *retval)
7610 {
7611 int error;
7612
7613 /* Need privilege for check */
7614 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
7615 if (error) {
7616 return error;
7617 }
7618
7619 /* Inherently racy, so it's not worth taking a lock here */
7620 *retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
7621
7622 return error;
7623 }
7624
7625 int
7626 memorystatus_get_pressure_status_kdp(void)
7627 {
7628 return (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
7629 }
7630
7631 /*
7632 * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
7633 *
7634 * This call is inflexible -- it does not distinguish between active/inactive, fatal/non-fatal
7635 * So, with the 2-level HWM, preserving previous behavior maps as follows.
7636 * - treat the limit passed in as both an active and inactive limit.
7637 * - treat the is_fatal_limit flag as though it applies to both active and inactive limits.
7638 *
7639 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK
7640 * - the is_fatal_limit is FALSE, meaning the active and inactive limits are non-fatal/soft
7641 * - so mapping is (active/non-fatal, inactive/non-fatal)
7642 *
7643 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
7644 * - the is_fatal_limit is TRUE, meaning the process's active and inactive limits are fatal/hard
7645 * - so mapping is (active/fatal, inactive/fatal)
7646 */
7647
7648 #if CONFIG_JETSAM
7649 static int
7650 memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit)
7651 {
7652 int error = 0;
7653 memorystatus_memlimit_properties_t entry;
7654
7655 entry.memlimit_active = high_water_mark;
7656 entry.memlimit_active_attr = 0;
7657 entry.memlimit_inactive = high_water_mark;
7658 entry.memlimit_inactive_attr = 0;
7659
7660 if (is_fatal_limit == TRUE) {
7661 entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7662 entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7663 }
7664
7665 error = memorystatus_set_memlimit_properties(pid, &entry);
7666 return error;
7667 }
7668 #endif /* CONFIG_JETSAM */
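
/*
 * Illustrative sketch of the commands above: the flags argument carries the
 * limit in MB, which the handler applies as identical active and inactive
 * limits -- fatal for SET_JETSAM_TASK_LIMIT, non-fatal for
 * SET_JETSAM_HIGH_WATER_MARK. Hypothetical pid/limit; not compiled here.
 */
#if 0
#include <sys/kern_memorystatus.h>

static int
set_fatal_task_limit(pid_t pid, uint32_t limit_mb)
{
	/* No buffer needed; the limit travels in the flags argument */
	return memorystatus_control(MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT, pid,
	    limit_mb, NULL, 0);
}
#endif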
7669
7670 static int
7671 memorystatus_set_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry)
7672 {
7673 int error = 0;
7674
7675 LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
7676
7677 /*
7678 * Store the active limit variants in the proc.
7679 */
7680 SET_ACTIVE_LIMITS_LOCKED(p, p_entry->memlimit_active, p_entry->memlimit_active_attr);
7681
7682 /*
7683 * Store the inactive limit variants in the proc.
7684 */
7685 SET_INACTIVE_LIMITS_LOCKED(p, p_entry->memlimit_inactive, p_entry->memlimit_inactive_attr);
7686
7687 /*
7688 * Enforce appropriate limit variant by updating the cached values
7689 * and writing the ledger.
7690 * Limit choice is based on process active/inactive state.
7691 */
7692
7693 if (memorystatus_highwater_enabled) {
7694 boolean_t is_fatal;
7695 boolean_t use_active;
7696
7697 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
7698 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
7699 use_active = TRUE;
7700 } else {
7701 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
7702 use_active = FALSE;
7703 }
7704
7705 /* Enforce the limit by writing to the ledgers */
7706 error = (task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal) == 0) ? 0 : EINVAL;
7707
7708 MEMORYSTATUS_DEBUG(3, "memorystatus_set_memlimit_properties: new limit on pid %d (%dMB %s) current priority (%d) dirty_state?=0x%x %s\n",
7709 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
7710 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
7711 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
7712 DTRACE_MEMORYSTATUS2(memorystatus_set_memlimit, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
7713 }
7714
7715 return error;
7716 }
7717
7718 static int
7719 memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry)
7720 {
7721 memorystatus_memlimit_properties_t set_entry;
7722
7723 proc_t p = proc_find(pid);
7724 if (!p) {
7725 return ESRCH;
7726 }
7727
7728 /*
7729 * Check for valid attribute flags.
7730 */
7731 const uint32_t valid_attrs = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7732 if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
7733 proc_rele(p);
7734 return EINVAL;
7735 }
7736 if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
7737 proc_rele(p);
7738 return EINVAL;
7739 }
7740
7741 /*
7742 * Setup the active memlimit properties
7743 */
7744 set_entry.memlimit_active = entry->memlimit_active;
7745 set_entry.memlimit_active_attr = entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7746
7747 /*
7748 * Setup the inactive memlimit properties
7749 */
7750 set_entry.memlimit_inactive = entry->memlimit_inactive;
7751 set_entry.memlimit_inactive_attr = entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7752
7753 /*
7754 * Setting a limit of <= 0 implies that the process has no
7755 * high-water-mark and has no per-task-limit. That means
7756 * the system_wide task limit is in place, which, by the way,
7757 * is always fatal.
7758 */
7759
7760 if (set_entry.memlimit_active <= 0) {
7761 /*
7762 * Enforce the fatal system_wide task limit while process is active.
7763 */
7764 set_entry.memlimit_active = -1;
7765 set_entry.memlimit_active_attr = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7766 }
7767 #if CONFIG_JETSAM
7768 #if DEVELOPMENT || DEBUG
7769 else {
7770 /* add the current increase to it, for roots */
7771 set_entry.memlimit_active += roundToNearestMB(p->p_memlimit_increase);
7772 }
7773 #endif /* DEVELOPMENT || DEBUG */
7774 #endif /* CONFIG_JETSAM */
7775
7776 if (set_entry.memlimit_inactive <= 0) {
7777 /*
7778 * Enforce the fatal system_wide task limit while process is inactive.
7779 */
7780 set_entry.memlimit_inactive = -1;
7781 set_entry.memlimit_inactive_attr = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7782 }
7783 #if CONFIG_JETSAM
7784 #if DEVELOPMENT || DEBUG
7785 else {
7786 /* add the current increase to it, for roots */
7787 set_entry.memlimit_inactive += roundToNearestMB(p->p_memlimit_increase);
7788 }
7789 #endif /* DEVELOPMENT || DEBUG */
7790 #endif /* CONFIG_JETSAM */
7791
7792 proc_list_lock();
7793
7794 int error = memorystatus_set_memlimit_properties_internal(p, &set_entry);
7795
7796 proc_list_unlock();
7797 proc_rele(p);
7798
7799 return error;
7800 }
7801
7802 /*
7803 * Returns the jetsam priority (effective or requested) of the process
7804 * associated with this task.
7805 */
7806 int
7807 proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
7808 {
7809 if (p) {
7810 if (effective_priority) {
7811 return p->p_memstat_effectivepriority;
7812 } else {
7813 return p->p_memstat_requestedpriority;
7814 }
7815 }
7816 return 0;
7817 }
7818
7819 static int
7820 memorystatus_get_process_is_managed(pid_t pid, int *is_managed)
7821 {
7822 proc_t p = NULL;
7823
7824 /* Validate inputs */
7825 if (pid == 0) {
7826 return EINVAL;
7827 }
7828
7829 p = proc_find(pid);
7830 if (!p) {
7831 return ESRCH;
7832 }
7833
7834 proc_list_lock();
7835 *is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0);
7836 proc_rele_locked(p);
7837 proc_list_unlock();
7838
7839 return 0;
7840 }
7841
7842 static int
7843 memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed)
7844 {
7845 proc_t p = NULL;
7846
7847 /* Validate inputs */
7848 if (pid == 0) {
7849 return EINVAL;
7850 }
7851
7852 p = proc_find(pid);
7853 if (!p) {
7854 return ESRCH;
7855 }
7856
7857 proc_list_lock();
7858 if (set_managed == TRUE) {
7859 p->p_memstat_state |= P_MEMSTAT_MANAGED;
7860 /*
7861 * The P_MEMSTAT_MANAGED bit is set by assertiond for Apps.
7862 * Also opt them in to being frozen (they might have started
7863 * off with the P_MEMSTAT_FREEZE_DISABLED bit set.)
7864 */
7865 p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED;
7866 } else {
7867 p->p_memstat_state &= ~P_MEMSTAT_MANAGED;
7868 }
7869 proc_rele_locked(p);
7870 proc_list_unlock();
7871
7872 return 0;
7873 }
7874
7875 int
7876 memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret)
7877 {
7878 int error = EINVAL;
7879 boolean_t skip_auth_check = FALSE;
7880 os_reason_t jetsam_reason = OS_REASON_NULL;
7881
7882 #if !CONFIG_JETSAM
7883 #pragma unused(ret)
7884 #pragma unused(jetsam_reason)
7885 #endif
7886
7887 /* We don't need entitlements if we're setting/querying the freeze preference for a process. Skip the check below. */
7888 if (args->command == MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE || args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE) {
7889 skip_auth_check = TRUE;
7890 }
7891
7892 /* Need to be root or have entitlement. */
7893 if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT) && !skip_auth_check) {
7894 error = EPERM;
7895 goto out;
7896 }
7897
7898 /*
7899 * Sanity check.
7900 * Do not enforce it for snapshots.
7901 */
7902 if (args->command != MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT) {
7903 if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
7904 error = EINVAL;
7905 goto out;
7906 }
7907 }
7908
7909 switch (args->command) {
7910 case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
7911 error = memorystatus_cmd_get_priority_list(args->pid, args->buffer, args->buffersize, ret);
7912 break;
7913 case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
7914 error = memorystatus_cmd_set_priority_properties(args->pid, args->flags, args->buffer, args->buffersize, ret);
7915 break;
7916 case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
7917 error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
7918 break;
7919 case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES:
7920 error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
7921 break;
7922 case MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS:
7923 error = memorystatus_cmd_get_memlimit_excess_np(args->pid, args->flags, args->buffer, args->buffersize, ret);
7924 break;
7925 case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
7926 error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
7927 break;
7928 case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
7929 error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
7930 break;
7931 #if DEVELOPMENT || DEBUG
7932 case MEMORYSTATUS_CMD_SET_JETSAM_SNAPSHOT_OWNERSHIP:
7933 error = memorystatus_cmd_set_jetsam_snapshot_ownership((int32_t) args->flags);
7934 break;
7935 #endif
7936 case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
7937 error = memorystatus_cmd_get_pressure_status(ret);
7938 break;
7939 #if CONFIG_JETSAM
7940 case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
7941 /*
7942 * This call does not distinguish between active and inactive limits.
7943 * Default behavior in 2-level HWM world is to set both.
7944 * Non-fatal limit is also assumed for both.
7945 */
7946 error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
7947 break;
7948 case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
7949 /*
7950 * This call does not distinguish between active and inactive limits.
7951 * Default behavior in 2-level HWM world is to set both.
7952 * Fatal limit is also assumed for both.
7953 */
7954 error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
7955 break;
7956 #endif /* CONFIG_JETSAM */
7957 /* Test commands */
7958 #if DEVELOPMENT || DEBUG
7959 case MEMORYSTATUS_CMD_TEST_JETSAM:
7960 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_GENERIC);
7961 if (jetsam_reason == OS_REASON_NULL) {
7962 printf("memorystatus_control: failed to allocate jetsam reason\n");
7963 }
7964
7965 error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 0 : EINVAL;
7966 break;
7967 case MEMORYSTATUS_CMD_TEST_JETSAM_SORT:
7968 error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags, args->buffer, args->buffersize);
7969 break;
7970 #if CONFIG_JETSAM
7971 case MEMORYSTATUS_CMD_SET_JETSAM_PANIC_BITS:
7972 error = memorystatus_cmd_set_panic_bits(args->buffer, args->buffersize);
7973 break;
7974 #endif /* CONFIG_JETSAM */
7975 #else /* DEVELOPMENT || DEBUG */
7976 #pragma unused(jetsam_reason)
7977 #endif /* DEVELOPMENT || DEBUG */
7978 case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE:
7979 if (memorystatus_aggressive_jetsam_lenient_allowed == FALSE) {
7980 #if DEVELOPMENT || DEBUG
7981 printf("Enabling Lenient Mode\n");
7982 #endif /* DEVELOPMENT || DEBUG */
7983
7984 memorystatus_aggressive_jetsam_lenient_allowed = TRUE;
7985 memorystatus_aggressive_jetsam_lenient = TRUE;
7986 error = 0;
7987 }
7988 break;
7989 case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE:
7990 #if DEVELOPMENT || DEBUG
7991 printf("Disabling Lenient mode\n");
7992 #endif /* DEVELOPMENT || DEBUG */
7993 memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
7994 memorystatus_aggressive_jetsam_lenient = FALSE;
7995 error = 0;
7996 break;
7997 case MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE:
7998 *ret = (memorystatus_aggressive_jetsam_lenient ? 1 : 0);
7999 error = 0;
8000 break;
8001 case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
8002 case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
8003 error = memorystatus_low_mem_privileged_listener(args->command);
8004 break;
8005
8006 case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE:
8007 case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE:
8008 error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, JETSAM_PRIORITY_ELEVATED_INACTIVE, args->flags ? TRUE : FALSE);
8009 break;
8010 case MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED:
8011 error = memorystatus_set_process_is_managed(args->pid, args->flags);
8012 break;
8013
8014 case MEMORYSTATUS_CMD_GET_PROCESS_IS_MANAGED:
8015 error = memorystatus_get_process_is_managed(args->pid, ret);
8016 break;
8017
8018 #if CONFIG_FREEZE
8019 case MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE:
8020 error = memorystatus_set_process_is_freezable(args->pid, args->flags ? TRUE : FALSE);
8021 break;
8022
8023 case MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE:
8024 error = memorystatus_get_process_is_freezable(args->pid, ret);
8025 break;
8026
8027 case MEMORYSTATUS_CMD_FREEZER_CONTROL:
8028 error = memorystatus_freezer_control(args->flags, args->buffer, args->buffersize, ret);
8029 break;
8030 #endif /* CONFIG_FREEZE */
8031
8032 #if CONFIG_JETSAM
8033 #if DEVELOPMENT || DEBUG
8034 case MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT:
8035 error = memorystatus_cmd_increase_jetsam_task_limit(args->pid, args->flags);
8036 break;
8037 #endif /* DEVELOPMENT || DEBUG */
8038 #endif /* CONFIG_JETSAM */
8039
8040 default:
8041 break;
8042 }
8043
8044 out:
8045 return error;
8046 }
8047
8048 /* Coalition support */
8049
8050 /* sorting info for a particular priority bucket */
8051 typedef struct memstat_sort_info {
8052 coalition_t msi_coal;
8053 uint64_t msi_page_count;
8054 pid_t msi_pid;
8055 int msi_ntasks;
8056 } memstat_sort_info_t;
8057
8058 /*
8059 * qsort from smallest page count to largest page count
8060 *
8061 * return < 0 for a < b
8062 * 0 for a == b
8063 * > 0 for a > b
8064 */
8065 static int
8066 memstat_asc_cmp(const void *a, const void *b)
8067 {
8068 const memstat_sort_info_t *msA = (const memstat_sort_info_t *)a;
8069 const memstat_sort_info_t *msB = (const memstat_sort_info_t *)b;
8070 /* compare explicitly -- casting the unsigned difference to int can wrap for large counts */
8071 return (msA->msi_page_count < msB->msi_page_count) ? -1 : ((msA->msi_page_count > msB->msi_page_count) ? 1 : 0);
8072 }
8073
8074 /*
8075 * Return the number of pids rearranged during this sort.
8076 */
8077 static int
8078 memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order)
8079 {
8080 #define MAX_SORT_PIDS 80
8081 #define MAX_COAL_LEADERS 10
8082
8083 unsigned int b = bucket_index;
8084 int nleaders = 0;
8085 int ntasks = 0;
8086 proc_t p = NULL;
8087 coalition_t coal = COALITION_NULL;
8088 int pids_moved = 0;
8089 int total_pids_moved = 0;
8090 int i;
8091
8092 /*
8093 * The system is typically under memory pressure when in this
8094 * path, hence, we want to avoid dynamic memory allocation.
8095 */
8096 memstat_sort_info_t leaders[MAX_COAL_LEADERS];
8097 pid_t pid_list[MAX_SORT_PIDS];
8098
8099 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
8100 return 0;
8101 }
8102
8103 /*
8104 * Clear the array that holds coalition leader information
8105 */
8106 for (i = 0; i < MAX_COAL_LEADERS; i++) {
8107 leaders[i].msi_coal = COALITION_NULL;
8108 leaders[i].msi_page_count = 0; /* will hold total coalition page count */
8109 leaders[i].msi_pid = 0; /* will hold coalition leader pid */
8110 leaders[i].msi_ntasks = 0; /* will hold the number of tasks in a coalition */
8111 }

	p = memorystatus_get_first_proc_locked(&b, FALSE);
	while (p) {
		coal = task_get_coalition(p->task, COALITION_TYPE_JETSAM);
		if (coalition_is_leader(p->task, coal)) {
			if (nleaders < MAX_COAL_LEADERS) {
				int coal_ntasks = 0;
				uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks);
				leaders[nleaders].msi_coal = coal;
				leaders[nleaders].msi_page_count = coal_page_count;
				leaders[nleaders].msi_pid = p->p_pid; /* the coalition leader */
				leaders[nleaders].msi_ntasks = coal_ntasks;
				nleaders++;
			} else {
				/*
				 * We've hit MAX_COAL_LEADERS, so we cannot track any more
				 * coalitions in this pass. The coalitions abandoned here
				 * will linger at the tail of the priority band when this
				 * sort session ends.
				 * TODO: should this be an assert?
				 */
				printf("%s: WARNING: more than %d leaders in priority band [%d]\n",
				    __FUNCTION__, MAX_COAL_LEADERS, bucket_index);
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&b, p, FALSE);
	}

	if (nleaders == 0) {
		/* Nothing to sort */
		return 0;
	}

	/*
	 * Sort the coalition leader array from smallest coalition page count
	 * to largest. Because re-insertion into the priority bucket below
	 * starts with the smallest coalition, the smallest coalition ends up
	 * farthest from the head of the band and is therefore jetsammed last.
	 */
	if (nleaders > 1) {
		qsort(leaders, nleaders, sizeof(memstat_sort_info_t), memstat_asc_cmp);
	}

#if 0
	for (i = 0; i < nleaders; i++) {
		printf("%s: coal_leader[%d of %d] pid[%d] pages[%llu] ntasks[%d]\n",
		    __FUNCTION__, i, nleaders, leaders[i].msi_pid, leaders[i].msi_page_count,
		    leaders[i].msi_ntasks);
	}
#endif

	/*
	 * During coalition sorting, processes in a priority band are rearranged
	 * by being re-inserted at the head of the queue. So, within a given
	 * list, the first process moved to the head of the queue is pushed
	 * toward the queue tail by each move that follows, and hence jetsams
	 * last.
	 *
	 * The coalition leader is expected to jetsam last, after its coalition
	 * members, so the leader is inserted at the head of the queue first.
	 * For example, moving leader L, then xpc service X, then extension E,
	 * then undefined member U leaves the band reading (head to tail)
	 * U, E, X, L, and jetsam consumes it in that order.
	 *
	 * After processing a coalition, the jetsam order is as follows:
	 * undefs (jetsam first), extensions, xpc services, leader (jetsam last)
	 */

	/*
	 * Coalition members are rearranged in the priority bucket here,
	 * based on their coalition role.
	 */
	total_pids_moved = 0;
	for (i = 0; i < nleaders; i++) {
		/* a bit of bookkeeping */
		pids_moved = 0;

		/* Coalition leaders are jetsammed last, so move into place first */
		pid_list[0] = leaders[i].msi_pid;
		pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, 1);

		/* xpc services should jetsam after extensions */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_XPC,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* extensions should jetsam after unmarked processes */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_EXT,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* undefined coalition members should be the first to jetsam */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_UNDEF,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

#if 0
		if (pids_moved == leaders[i].msi_ntasks) {
			/*
			 * All the pids in the coalition were found in this band.
			 */
			printf("%s: pids_moved[%d] equals total coalition ntasks[%d]\n", __FUNCTION__,
			    pids_moved, leaders[i].msi_ntasks);
		} else if (pids_moved > leaders[i].msi_ntasks) {
			/*
			 * Apparently new coalition members showed up during the sort?
			 */
			printf("%s: pids_moved[%d] is greater than expected coalition ntasks[%d]\n", __FUNCTION__,
			    pids_moved, leaders[i].msi_ntasks);
		} else {
			/*
			 * Apparently not all the pids in the coalition were found in this band.
			 */
			printf("%s: pids_moved[%d] is less than expected coalition ntasks[%d]\n", __FUNCTION__,
			    pids_moved, leaders[i].msi_ntasks);
		}
#endif

		total_pids_moved += pids_moved;
	} /* end for */

	return total_pids_moved;
}


/*
 * Traverse a list of pids, searching for each within the priority band provided.
 * If a pid is found, move it to the front of the priority band.
 * Never searches outside the priority band provided.
 *
 * Input:
 *	bucket_index - jetsam priority band.
 *	pid_list - pointer to a list of pids.
 *	list_sz - number of pids in the list.
 *
 * Pid list ordering is important: pid_list[n] is expected to jetsam ahead
 * of pid_list[n+1]. The sort_order is set by the coalition default.
 *
 * Return:
 *	the number of pids found and hence moved within the priority band.
 */
static int
memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz)
{
	memstat_bucket_t *current_bucket;
	int i;
	int found_pids = 0;

	if ((pid_list == NULL) || (list_sz <= 0)) {
		return 0;
	}

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return 0;
	}

	current_bucket = &memstat_bucket[bucket_index];
	for (i = 0; i < list_sz; i++) {
		unsigned int b = bucket_index;
		proc_t p = NULL;
		proc_t aProc = NULL;
		pid_t aPid;
		int list_index;

		/*
		 * Walk the pid list back to front: the last pid moved to the
		 * head of the queue stays at the head, so pid_list[0] is
		 * moved last and therefore jetsams first.
		 */
		list_index = ((list_sz - 1) - i);
		aPid = pid_list[list_index];

		/* never search beyond bucket_index provided */
		p = memorystatus_get_first_proc_locked(&b, FALSE);
		while (p) {
			if (p->p_pid == aPid) {
				aProc = p;
				break;
			}
			p = memorystatus_get_next_proc_locked(&b, p, FALSE);
		}

		if (aProc == NULL) {
			/* pid not found in this band, just skip it */
			continue;
		} else {
			TAILQ_REMOVE(&current_bucket->list, aProc, p_memstat_list);
			TAILQ_INSERT_HEAD(&current_bucket->list, aProc, p_memstat_list);
			found_pids++;
		}
	}
	return found_pids;
}
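
/*
 * Illustrative userland sketch (not part of this file): a toy TAILQ showing
 * why repeated TAILQ_INSERT_HEAD reverses the order in which items were
 * moved, which is the property the two routines above rely on. All names in
 * the sketch are hypothetical; only <sys/queue.h> is assumed. Guarded with
 * #if 0 so it never enters the kernel build.
 */
#if 0
#include <stdio.h>
#include <sys/queue.h>

struct toy_proc {
	const char *name;
	TAILQ_ENTRY(toy_proc) link;
};

int
main(void)
{
	TAILQ_HEAD(, toy_proc) band = TAILQ_HEAD_INITIALIZER(band);
	struct toy_proc leader = { "leader" }, xpc = { "xpc" },
	    ext = { "ext" }, undef = { "undef" };
	struct toy_proc *moves[] = { &leader, &xpc, &ext, &undef };
	struct toy_proc *p;
	int i;

	/* Move the leader first and undef last, mirroring the coalition sort. */
	for (i = 0; i < 4; i++) {
		TAILQ_INSERT_HEAD(&band, moves[i], link);
	}

	/* Prints: undef ext xpc leader -- i.e. undef sits at the head and jetsams first. */
	TAILQ_FOREACH(p, &band, link) {
		printf("%s ", p->name);
	}
	printf("\n");
	return 0;
}
#endif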

int
memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index)
{
	int32_t i = JETSAM_PRIORITY_IDLE;
	int count = 0;

	if (max_bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return -1;
	}

	while (i <= max_bucket_index) {
		count += memstat_bucket[i++].count;
	}

	return count;
}
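
/*
 * Illustrative usage (hypothetical caller, shown for clarity): the routine
 * above sums bucket counts from JETSAM_PRIORITY_IDLE through
 * max_bucket_index inclusive. Guarded with #if 0 so it never enters the
 * build.
 */
#if 0
static void
example_count_idle_band(void)
{
	/* With JETSAM_PRIORITY_IDLE as the bound, this is just the idle band. */
	int idle_count = memorystatus_get_proccnt_upto_priority(JETSAM_PRIORITY_IDLE);

	printf("procs at or below the idle band: %d\n", idle_count);
}
#endif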

int
memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap)
{
#if !CONFIG_JETSAM
	if (!p || (!isApp(p)) || (p->p_memstat_state & (P_MEMSTAT_INTERNAL | P_MEMSTAT_MANAGED))) {
		/*
		 * Ineligible processes OR system processes e.g. launchd.
		 *
		 * We also skip processes that have the P_MEMSTAT_MANAGED bit set, i.e.
		 * they're managed by assertiond. These are iOS apps that have been ported
		 * to macOS. assertiond might be in the process of modifying the app's
		 * priority / memory limit - so it might have the proc_list lock, and then try
		 * to take the task lock. Meanwhile we've entered this function with the task lock
		 * held, and we need the proc_list lock below. So we'll deadlock with assertiond.
		 *
		 * It should be fine to read the P_MEMSTAT_MANAGED bit without the proc_list
		 * lock here, since assertiond only sets this bit on process launch.
		 */
		return -1;
	}

	/*
	 * For macOS only:
	 * We would like to use memorystatus_update() here to move the processes
	 * within the bands. Unfortunately memorystatus_update() calls
	 * memorystatus_update_priority_locked(), which uses any band transition
	 * as an indication to modify ledgers. For that it needs the task lock,
	 * and since we came into this function with the task lock held, we'd deadlock.
	 *
	 * Unfortunately we can't completely disable ledger updates, because we
	 * still need them for a subset of processes, i.e. daemons. Once all
	 * processes on all platforms support memory limits, we can simply call
	 * memorystatus_update().
	 *
	 * memorystatus_update() also has some logic to deal with 'aging' which,
	 * currently, is only applicable on CONFIG_JETSAM configs. So, until every
	 * platform has CONFIG_JETSAM, we'll need to do this explicit band transition.
	 */

	memstat_bucket_t *current_bucket, *new_bucket;
	int32_t priority = 0;

	proc_list_lock();

	if (((p->p_listflag & P_LIST_EXITED) != 0) ||
	    (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED))) {
		/*
		 * If the process is on its way out OR
		 * jetsam has already tried and failed to kill this process,
		 * let's skip the whole jetsam band transition.
		 */
		proc_list_unlock();
		return 0;
	}

	if (is_appnap) {
		current_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
		new_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
		priority = JETSAM_PRIORITY_IDLE;
	} else {
		if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
			/*
			 * It is possible that someone pulled this process
			 * out of the IDLE band without updating its app-nap
			 * parameters.
			 */
			proc_list_unlock();
			return 0;
		}

		current_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
		new_bucket = &memstat_bucket[p->p_memstat_requestedpriority];
		priority = p->p_memstat_requestedpriority;
	}

	TAILQ_REMOVE(&current_bucket->list, p, p_memstat_list);
	current_bucket->count--;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		current_bucket->relaunch_high_count--;
	}
	TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	new_bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		new_bucket->relaunch_high_count++;
	}
	/*
	 * Record idle start or idle delta.
	 */
	if (p->p_memstat_effectivepriority == priority) {
		/*
		 * This process is not transitioning between
		 * jetsam priority buckets. Do nothing.
		 */
	} else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
		uint64_t now;
		/*
		 * Transitioning out of the idle priority bucket.
		 * Record idle delta.
		 */
		assert(p->p_memstat_idle_start != 0);
		now = mach_absolute_time();
		if (now > p->p_memstat_idle_start) {
			p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
		}
	} else if (priority == JETSAM_PRIORITY_IDLE) {
		/*
		 * Transitioning into the idle priority bucket.
		 * Record idle start.
		 */
		p->p_memstat_idle_start = mach_absolute_time();
	}

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), p->p_pid, priority, p->p_memstat_effectivepriority, 0, 0);

	p->p_memstat_effectivepriority = priority;

	proc_list_unlock();

	return 0;

#else /* !CONFIG_JETSAM */
#pragma unused(p)
#pragma unused(is_appnap)
	return -1;
#endif /* !CONFIG_JETSAM */
}
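
/*
 * Illustrative userland sketch (not part of this file): p_memstat_idle_start
 * and p_memstat_idle_delta above are recorded in mach absolute time units.
 * Converting such a delta to nanoseconds goes through mach_timebase_info();
 * the helper name here is hypothetical. Guarded with #if 0 so it never
 * enters the kernel build.
 */
#if 0
#include <stdint.h>
#include <mach/mach_time.h>

static uint64_t
example_abs_to_ns(uint64_t abs_delta)
{
	static mach_timebase_info_data_t tb;

	if (tb.denom == 0) {
		(void) mach_timebase_info(&tb); /* e.g. numer=125, denom=3 on Apple silicon */
	}
	/* Scale mach absolute time units into nanoseconds. */
	return abs_delta * tb.numer / tb.denom;
}
#endif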

uint64_t
memorystatus_available_memory_internal(struct proc *p)
{
#ifdef XNU_TARGET_OS_OSX
	if (p->p_memstat_memlimit <= 0) {
		return 0;
	}
#endif /* XNU_TARGET_OS_OSX */
	const uint64_t footprint_in_bytes = get_task_phys_footprint(p->task);
	int32_t memlimit_mb;
	int64_t memlimit_bytes;
	int64_t rc;

	if (isApp(p) == FALSE) {
		return 0;
	}

	if (p->p_memstat_memlimit > 0) {
		memlimit_mb = p->p_memstat_memlimit;
	} else if (task_convert_phys_footprint_limit(-1, &memlimit_mb) != KERN_SUCCESS) {
		return 0;
	}

	if (memlimit_mb <= 0) {
		/* No effective limit: use INT_MAX rounded down to a 1 MiB boundary. */
		memlimit_bytes = INT_MAX & ~((1 << 20) - 1);
	} else {
		memlimit_bytes = ((int64_t) memlimit_mb) << 20;
	}

	/* Headroom is the limit minus the current physical footprint, floored at 0. */
	rc = memlimit_bytes - footprint_in_bytes;

	return (rc >= 0) ? rc : 0;
}

int
memorystatus_available_memory(struct proc *p, __unused struct memorystatus_available_memory_args *args, uint64_t *ret)
{
	*ret = memorystatus_available_memory_internal(p);

	return 0;
}
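
/*
 * Illustrative userland sketch (not part of this file): the headroom value
 * computed above is what user space sees through os_proc_available_memory()
 * from <os/proc.h>; treating that wrapper as the consumer of this syscall is
 * an assumption here. Guarded with #if 0 so it never enters the kernel
 * build.
 */
#if 0
#include <stdio.h>
#include <os/proc.h>

int
main(void)
{
	/* 0 means no limit applies to this process (e.g. it is not an app). */
	size_t headroom = os_proc_available_memory();

	printf("bytes left before the jetsam limit: %zu\n", headroom);
	return 0;
}
#endif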

#if CONFIG_JETSAM
#if DEVELOPMENT || DEBUG
static int
memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase)
{
	memorystatus_memlimit_properties_t mmp_entry;

	/* Validate inputs */
	if ((pid == 0) || (byte_increase == 0)) {
		return EINVAL;
	}

	proc_t p = proc_find(pid);

	if (!p) {
		return ESRCH;
	}

	const uint32_t current_memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
	/* Round the cumulative increase to a page boundary, clamped to INT32_MAX. */
	const int32_t page_aligned_increase = (int32_t) MIN(round_page(p->p_memlimit_increase + byte_increase), INT32_MAX);

	proc_list_lock();

	memorystatus_get_memlimit_properties_internal(p, &mmp_entry);

	/* Back out the previously applied increase, then apply the new cumulative one. */
	if (mmp_entry.memlimit_active > 0) {
		mmp_entry.memlimit_active -= current_memlimit_increase;
		mmp_entry.memlimit_active += roundToNearestMB(page_aligned_increase);
	}

	if (mmp_entry.memlimit_inactive > 0) {
		mmp_entry.memlimit_inactive -= current_memlimit_increase;
		mmp_entry.memlimit_inactive += roundToNearestMB(page_aligned_increase);
	}

	/*
	 * Store the updated delta limit in the proc.
	 */
	p->p_memlimit_increase = page_aligned_increase;

	int error = memorystatus_set_memlimit_properties_internal(p, &mmp_entry);

	proc_list_unlock();
	proc_rele(p);

	return error;
}
#endif /* DEVELOPMENT || DEBUG */
#endif /* CONFIG_JETSAM */
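
/*
 * Illustrative userland sketch (not part of this file): on DEVELOPMENT or
 * DEBUG kernels, the handler above is reached via memorystatus_control(),
 * with the byte increase carried in the flags argument, as the dispatch
 * switch earlier in this file shows. Prototype assumed from
 * <sys/kern_memorystatus.h>; the target pid is hypothetical. Guarded with
 * #if 0 so it never enters the kernel build.
 */
#if 0
#include <stdio.h>
#include <sys/types.h>
#include <sys/kern_memorystatus.h>

int
main(void)
{
	pid_t target = 123;                     /* hypothetical target pid */
	uint32_t increase = 32 * 1024 * 1024;   /* grow the task limit by 32 MB */

	if (memorystatus_control(MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT,
	    target, increase, NULL, 0) != 0) {
		perror("memorystatus_control");
		return 1;
	}
	return 0;
}
#endif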