1 /*
2 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30 #include <kern/sched_prim.h>
31 #include <kern/kalloc.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/host.h>
38 #include <kern/policy_internal.h>
39 #include <kern/thread_group.h>
40
41 #include <IOKit/IOBSD.h>
42
43 #include <libkern/libkern.h>
44 #include <mach/coalition.h>
45 #include <mach/mach_time.h>
46 #include <mach/task.h>
47 #include <mach/host_priv.h>
48 #include <mach/mach_host.h>
49 #include <os/log.h>
50 #include <pexpert/pexpert.h>
51 #include <sys/coalition.h>
52 #include <sys/kern_event.h>
53 #include <sys/proc.h>
54 #include <sys/proc_info.h>
55 #include <sys/reason.h>
56 #include <sys/signal.h>
57 #include <sys/signalvar.h>
58 #include <sys/sysctl.h>
59 #include <sys/sysproto.h>
60 #include <sys/wait.h>
61 #include <sys/tree.h>
62 #include <sys/priv.h>
63 #include <vm/vm_pageout.h>
64 #include <vm/vm_protos.h>
65
66 #if CONFIG_FREEZE
67 #include <vm/vm_map.h>
68 #endif /* CONFIG_FREEZE */
69
70 #include <sys/kern_memorystatus.h>
71
72 #include <mach/machine/sdt.h>
73 #include <libkern/section_keywords.h>
74
75 /* For logging clarity */
76 static const char *memorystatus_kill_cause_name[] = {
77 "" ,
78 "jettisoned" , /* kMemorystatusKilled */
79 "highwater" , /* kMemorystatusKilledHiwat */
80 "vnode-limit" , /* kMemorystatusKilledVnodes */
81 "vm-pageshortage" , /* kMemorystatusKilledVMPageShortage */
82 "vm-thrashing" , /* kMemorystatusKilledVMThrashing */
83 "fc-thrashing" , /* kMemorystatusKilledFCThrashing */
84 "per-process-limit" , /* kMemorystatusKilledPerProcessLimit */
85 "diagnostic" , /* kMemorystatusKilledDiagnostic */
86 "idle-exit" , /* kMemorystatusKilledIdleExit */
87 "zone-map-exhaustion" , /* kMemorystatusKilledZoneMapExhaustion */
88 };
89
90 static const char *
91 memorystatus_priority_band_name(int32_t priority)
92 {
93 switch (priority) {
94 case JETSAM_PRIORITY_FOREGROUND:
95 return "FOREGROUND";
96 case JETSAM_PRIORITY_AUDIO_AND_ACCESSORY:
97 return "AUDIO_AND_ACCESSORY";
98 case JETSAM_PRIORITY_CONDUCTOR:
99 return "CONDUCTOR";
100 case JETSAM_PRIORITY_HOME:
101 return "HOME";
102 case JETSAM_PRIORITY_EXECUTIVE:
103 return "EXECUTIVE";
104 case JETSAM_PRIORITY_IMPORTANT:
105 return "IMPORTANT";
106 case JETSAM_PRIORITY_CRITICAL:
107 return "CRITICAL";
108 }
109
110 return ("?");
111 }
112
113 /* Does cause indicate vm or fc thrashing? */
114 static boolean_t
115 is_reason_thrashing(unsigned cause)
116 {
117 switch (cause) {
118 case kMemorystatusKilledVMThrashing:
119 case kMemorystatusKilledFCThrashing:
120 return TRUE;
121 default:
122 return FALSE;
123 }
124 }
125
126 /* Is the zone map almost full? */
127 static boolean_t
128 is_reason_zone_map_exhaustion(unsigned cause)
129 {
130 if (cause == kMemorystatusKilledZoneMapExhaustion)
131 return TRUE;
132 return FALSE;
133 }
134
135 /*
136 * Returns the current zone map size and capacity to include in the jetsam snapshot.
137 * Defined in zalloc.c
138 */
139 extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity);
140
141 /*
142 * Returns the name of the largest zone and its size to include in the jetsam snapshot.
143 * Defined in zalloc.c
144 */
145 extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size);
146
147 /* These are very verbose printf()s; enable them by defining
148  * MEMORYSTATUS_DEBUG_LOG.
149 */
150 #if MEMORYSTATUS_DEBUG_LOG
151 #define MEMORYSTATUS_DEBUG(cond, format, ...) \
152 do { \
153 if (cond) { printf(format, ##__VA_ARGS__); } \
154 } while(0)
155 #else
156 #define MEMORYSTATUS_DEBUG(cond, format, ...)
157 #endif
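/*
 * Illustrative (hypothetical) invocation -- it compiles to nothing unless the
 * kernel is built with MEMORYSTATUS_DEBUG_LOG defined:
 *
 *	MEMORYSTATUS_DEBUG(1, "memorystatus: pid %d moved to priority %d\n",
 *	    p->p_pid, p->p_memstat_effectivepriority);
 */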
158
159 /*
160 * Active / Inactive limit support
161 * proc list must be locked
162 *
163 * The SET_*** macros are used to initialize a limit
164 * for the first time.
165 *
166  * The CACHE_*** macros are used to cache the limit that will
167 * soon be in effect down in the ledgers.
168 */
169
170 #define SET_ACTIVE_LIMITS_LOCKED(p, limit, is_fatal) \
171 MACRO_BEGIN \
172 (p)->p_memstat_memlimit_active = (limit); \
173 if (is_fatal) { \
174 (p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL; \
175 } else { \
176 (p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL; \
177 } \
178 MACRO_END
179
180 #define SET_INACTIVE_LIMITS_LOCKED(p, limit, is_fatal) \
181 MACRO_BEGIN \
182 (p)->p_memstat_memlimit_inactive = (limit); \
183 if (is_fatal) { \
184 (p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL; \
185 } else { \
186 (p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL; \
187 } \
188 MACRO_END
189
190 #define CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal) \
191 MACRO_BEGIN \
192 (p)->p_memstat_memlimit = (p)->p_memstat_memlimit_active; \
193 if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) { \
194 (p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT; \
195 is_fatal = TRUE; \
196 } else { \
197 (p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT; \
198 is_fatal = FALSE; \
199 } \
200 MACRO_END
201
202 #define CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal) \
203 MACRO_BEGIN \
204 (p)->p_memstat_memlimit = (p)->p_memstat_memlimit_inactive; \
205 if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) { \
206 (p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT; \
207 is_fatal = TRUE; \
208 } else { \
209 (p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT; \
210 is_fatal = FALSE; \
211 } \
212 MACRO_END
213
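/*
 * Illustrative usage sketch (not a verbatim call site): with the proc list
 * lock held, a limit is typically stored with the SET_*** macros and then,
 * depending on whether the process currently counts as active, cached and
 * pushed down to the ledger -- the same flow used by
 * sysctl_memorystatus_highwater_enable() below. The local names
 * active_limit_mb / inactive_limit_mb / *_is_fatal are placeholders:
 *
 *	boolean_t is_fatal;
 *	boolean_t use_active = proc_jetsam_state_is_active_locked(p);
 *
 *	SET_ACTIVE_LIMITS_LOCKED(p, active_limit_mb, active_is_fatal);
 *	SET_INACTIVE_LIMITS_LOCKED(p, inactive_limit_mb, inactive_is_fatal);
 *	if (use_active) {
 *		CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
 *	} else {
 *		CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
 *	}
 *	task_set_phys_footprint_limit_internal(p->task,
 *	    (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
 *	    NULL, use_active, is_fatal);
 */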
214
215 /* General tunables */
216
217 unsigned long delta_percentage = 5;
218 unsigned long critical_threshold_percentage = 5;
219 unsigned long idle_offset_percentage = 5;
220 unsigned long pressure_threshold_percentage = 15;
221 unsigned long freeze_threshold_percentage = 50;
222 unsigned long policy_more_free_offset_percentage = 5;
223
224 /* General memorystatus stuff */
225
226 struct klist memorystatus_klist;
227 static lck_mtx_t memorystatus_klist_mutex;
228
229 static void memorystatus_klist_lock(void);
230 static void memorystatus_klist_unlock(void);
231
232 static uint64_t memorystatus_sysprocs_idle_delay_time = 0;
233 static uint64_t memorystatus_apps_idle_delay_time = 0;
234
235 /*
236 * Memorystatus kevents
237 */
238
239 static int filt_memorystatusattach(struct knote *kn, struct kevent_internal_s *kev);
240 static void filt_memorystatusdetach(struct knote *kn);
241 static int filt_memorystatus(struct knote *kn, long hint);
242 static int filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev);
243 static int filt_memorystatusprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
244
245 SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
246 .f_attach = filt_memorystatusattach,
247 .f_detach = filt_memorystatusdetach,
248 .f_event = filt_memorystatus,
249 .f_touch = filt_memorystatustouch,
250 .f_process = filt_memorystatusprocess,
251 };
252
253 enum {
254 kMemorystatusNoPressure = 0x1,
255 kMemorystatusPressure = 0x2,
256 kMemorystatusLowSwap = 0x4,
257 kMemorystatusProcLimitWarn = 0x8,
258 kMemorystatusProcLimitCritical = 0x10
259 };
260
261 /* Idle guard handling */
262
263 static int32_t memorystatus_scheduled_idle_demotions_sysprocs = 0;
264 static int32_t memorystatus_scheduled_idle_demotions_apps = 0;
265
266 static thread_call_t memorystatus_idle_demotion_call;
267
268 static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
269 static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state);
270 static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state);
271 static void memorystatus_reschedule_idle_demotion_locked(void);
272
273 static void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check);
274
275 int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap);
276
277 vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
278
279 boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
280 void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
281 void memorystatus_send_low_swap_note(void);
282
283 int memorystatus_wakeup = 0;
284
285 unsigned int memorystatus_level = 0;
286
287 static int memorystatus_list_count = 0;
288
289 #define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1)
290
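/*
 * One bucket per jetsam priority band (0 .. JETSAM_PRIORITY_MAX). Each bucket
 * holds a TAILQ of the processes currently at that effective priority; the
 * buckets and their lists are manipulated under the proc list lock.
 */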
291 typedef struct memstat_bucket {
292 TAILQ_HEAD(, proc) list;
293 int count;
294 } memstat_bucket_t;
295
296 memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
297
298 int memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index);
299
300 uint64_t memstat_idle_demotion_deadline = 0;
301
302 int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
303 int applications_aging_band = JETSAM_PRIORITY_IDLE;
304
305 #define isProcessInAgingBands(p) ((isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) || (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)))
306 #define isApp(p) (! (p->p_memstat_dirty & P_DIRTY_TRACK))
307 #define isSysProc(p) ((p->p_memstat_dirty & P_DIRTY_TRACK))
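/*
 * The app vs. system-process distinction above is based solely on dirty
 * tracking: a process that has opted into dirty-state tracking
 * (P_DIRTY_TRACK) is treated as a system process/daemon, and one that has
 * not is treated as an app.
 */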
308
309 #define kJetsamAgingPolicyNone (0)
310 #define kJetsamAgingPolicyLegacy (1)
311 #define kJetsamAgingPolicySysProcsReclaimedFirst (2)
312 #define kJetsamAgingPolicyAppsReclaimedFirst (3)
313 #define kJetsamAgingPolicyMax kJetsamAgingPolicyAppsReclaimedFirst
314
315 unsigned int jetsam_aging_policy = kJetsamAgingPolicyLegacy;
316
317 extern int corpse_for_fatal_memkill;
318 extern unsigned long total_corpses_count(void) __attribute__((pure));
319 extern void task_purge_all_corpses(void);
320 boolean_t memorystatus_allowed_vm_map_fork(__unused task_t);
321
322 #if 0
323
324 /* Keeping around for future use if we need a utility that can do this OR an app that needs a dynamic adjustment. */
325
326 static int
327 sysctl_set_jetsam_aging_policy SYSCTL_HANDLER_ARGS
328 {
329 #pragma unused(oidp, arg1, arg2)
330
331 int error = 0, val = 0;
332 memstat_bucket_t *old_bucket = 0;
333 int old_system_procs_aging_band = 0, new_system_procs_aging_band = 0;
334 int old_applications_aging_band = 0, new_applications_aging_band = 0;
335 proc_t p = NULL, next_proc = NULL;
336
337
338 error = sysctl_io_number(req, jetsam_aging_policy, sizeof(int), &val, NULL);
339 if (error || !req->newptr) {
340 return (error);
341 }
342
343 if ((val < 0) || (val > kJetsamAgingPolicyMax)) {
344 printf("jetsam: ordering policy sysctl has invalid value - %d\n", val);
345 return EINVAL;
346 }
347
348 /*
349  * We need to synchronize with any potential adding/removal from the aging bands
350  * that might currently be in progress. We use the proc_list_lock() just for
351  * consistency with all the routines dealing with 'aging' processes; a
352  * lighter-weight lock would really suffice here.
353 */
354 proc_list_lock();
355
356 old_system_procs_aging_band = system_procs_aging_band;
357 old_applications_aging_band = applications_aging_band;
358
359 switch (val) {
360
361 case kJetsamAgingPolicyNone:
362 new_system_procs_aging_band = JETSAM_PRIORITY_IDLE;
363 new_applications_aging_band = JETSAM_PRIORITY_IDLE;
364 break;
365
366 case kJetsamAgingPolicyLegacy:
367 /*
368  * Legacy behavior: some daemons get a one-time 10s protection, granted only before their first clean->dirty->clean transition, before moving into the IDLE band.
369 */
370 new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
371 new_applications_aging_band = JETSAM_PRIORITY_IDLE;
372 break;
373
374 case kJetsamAgingPolicySysProcsReclaimedFirst:
375 new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
376 new_applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
377 break;
378
379 case kJetsamAgingPolicyAppsReclaimedFirst:
380 new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2;
381 new_applications_aging_band = JETSAM_PRIORITY_AGING_BAND1;
382 break;
383
384 default:
385 break;
386 }
387
388 if (old_system_procs_aging_band && (old_system_procs_aging_band != new_system_procs_aging_band)) {
389
390 old_bucket = &memstat_bucket[old_system_procs_aging_band];
391 p = TAILQ_FIRST(&old_bucket->list);
392
393 while (p) {
394
395 next_proc = TAILQ_NEXT(p, p_memstat_list);
396
397 if (isSysProc(p)) {
398 if (new_system_procs_aging_band == JETSAM_PRIORITY_IDLE) {
399 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
400 }
401
402 memorystatus_update_priority_locked(p, new_system_procs_aging_band, false, true);
403 }
404
405 p = next_proc;
406 continue;
407 }
408 }
409
410 if (old_applications_aging_band && (old_applications_aging_band != new_applications_aging_band)) {
411
412 old_bucket = &memstat_bucket[old_applications_aging_band];
413 p = TAILQ_FIRST(&old_bucket->list);
414
415 while (p) {
416
417 next_proc = TAILQ_NEXT(p, p_memstat_list);
418
419 if (isApp(p)) {
420 if (new_applications_aging_band == JETSAM_PRIORITY_IDLE) {
421 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
422 }
423
424 memorystatus_update_priority_locked(p, new_applications_aging_band, false, true);
425 }
426
427 p = next_proc;
428 continue;
429 }
430 }
431
432 jetsam_aging_policy = val;
433 system_procs_aging_band = new_system_procs_aging_band;
434 applications_aging_band = new_applications_aging_band;
435
436 proc_list_unlock();
437
438 return (0);
439 }
440
441 SYSCTL_PROC(_kern, OID_AUTO, set_jetsam_aging_policy, CTLTYPE_INT|CTLFLAG_RW,
442 0, 0, sysctl_set_jetsam_aging_policy, "I", "Jetsam Aging Policy");
443 #endif /*0*/
444
445 static int
446 sysctl_jetsam_set_sysprocs_idle_delay_time SYSCTL_HANDLER_ARGS
447 {
448 #pragma unused(oidp, arg1, arg2)
449
450 int error = 0, val = 0, old_time_in_secs = 0;
451 uint64_t old_time_in_ns = 0;
452
453 absolutetime_to_nanoseconds(memorystatus_sysprocs_idle_delay_time, &old_time_in_ns);
454 old_time_in_secs = old_time_in_ns / NSEC_PER_SEC;
455
456 error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
457 if (error || !req->newptr) {
458 return (error);
459 }
460
461 if ((val < 0) || (val > INT32_MAX)) {
462 printf("jetsam: new idle delay interval has invalid value.\n");
463 return EINVAL;
464 }
465
466 nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
467
468 return(0);
469 }
470
471 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_sysprocs_idle_delay_time, CTLTYPE_INT|CTLFLAG_RW,
472 0, 0, sysctl_jetsam_set_sysprocs_idle_delay_time, "I", "Aging window for system processes");
473
474
475 static int
476 sysctl_jetsam_set_apps_idle_delay_time SYSCTL_HANDLER_ARGS
477 {
478 #pragma unused(oidp, arg1, arg2)
479
480 int error = 0, val = 0, old_time_in_secs = 0;
481 uint64_t old_time_in_ns = 0;
482
483 absolutetime_to_nanoseconds(memorystatus_apps_idle_delay_time, &old_time_in_ns);
484 old_time_in_secs = old_time_in_ns / NSEC_PER_SEC;
485
486 error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
487 if (error || !req->newptr) {
488 return (error);
489 }
490
491 if ((val < 0) || (val > INT32_MAX)) {
492 printf("jetsam: new idle delay interval has invalid value.\n");
493 return EINVAL;
494 }
495
496 nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
497
498 return(0);
499 }
500
501 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_apps_idle_delay_time, CTLTYPE_INT|CTLFLAG_RW,
502 0, 0, sysctl_jetsam_set_apps_idle_delay_time, "I", "Aging window for applications");
503
504 SYSCTL_INT(_kern, OID_AUTO, jetsam_aging_policy, CTLTYPE_INT|CTLFLAG_RD, &jetsam_aging_policy, 0, "");
505
506 static unsigned int memorystatus_dirty_count = 0;
507
508 SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED, &max_task_footprint_mb, 0, "");
509
510 #if CONFIG_EMBEDDED
511
512 SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_level, 0, "");
513
514 #endif /* CONFIG_EMBEDDED */
515
516 int
517 memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
518 {
519 user_addr_t level = 0;
520
521 level = args->level;
522
523 if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
524 return EFAULT;
525 }
526
527 return 0;
528 }
529
530 static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search);
531 static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search);
532
533 static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);
534
535 /* Memory Limits */
536
537 static int memorystatus_highwater_enabled = 1; /* Update the cached memlimit data. */
538
539 static boolean_t proc_jetsam_state_is_active_locked(proc_t);
540 static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
541 static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
542
543
544 static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
545
546 static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry);
547
548 static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
549
550 static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
551
552 int proc_get_memstat_priority(proc_t, boolean_t);
553
554 static boolean_t memorystatus_idle_snapshot = 0;
555
556 unsigned int memorystatus_delta = 0;
557
558 /* Jetsam Loop Detection */
559 static boolean_t memorystatus_jld_enabled = FALSE; /* Enable jetsam loop detection */
560 static uint32_t memorystatus_jld_eval_period_msecs = 0; /* Init pass sets this based on device memory size */
561 static int memorystatus_jld_eval_aggressive_count = 3; /* Raise the priority max after 'n' aggressive loops */
562 static int memorystatus_jld_eval_aggressive_priority_band_max = 15; /* Kill aggressively up through this band */
563
564 /*
565  * An FG app can request that the aggressive jetsam mechanism show some leniency in the FG band. This 'lenient' mode works as follows:
566  * --- if aggressive jetsam kills an app in the FG band and gets back >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD memory, it stops the aggressive march further into and up the jetsam bands.
567 *
568 * RESTRICTIONS:
569  * - Such a request is respected/acknowledged only once while that 'requesting' app is in the FG band, i.e. if aggressive jetsam was
570  * needed and the 'lenient' mode was deployed, then that's it for this special mode while the app is in the FG band.
571 *
572 * - If the app is still in the FG band and aggressive jetsam is needed again, there will be no stop-and-check the next time around.
573 *
574 * - Also, the transition of the 'requesting' app away from the FG band will void this special behavior.
575 */
576
577 #define AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD 25
578 boolean_t memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
579 boolean_t memorystatus_aggressive_jetsam_lenient = FALSE;
580
581 #if DEVELOPMENT || DEBUG
582 /*
583 * Jetsam Loop Detection tunables.
584 */
585
586 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_period_msecs, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_period_msecs, 0, "");
587 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_count, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_count, 0, "");
588 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_priority_band_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_priority_band_max, 0, "");
589 #endif /* DEVELOPMENT || DEBUG */
590
591 static uint32_t kill_under_pressure_cause = 0;
592
593 /*
594 * default jetsam snapshot support
595 */
596 static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
597 #define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries
598 static unsigned int memorystatus_jetsam_snapshot_count = 0;
599 static unsigned int memorystatus_jetsam_snapshot_max = 0;
600 static uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
601 static uint64_t memorystatus_jetsam_snapshot_timeout = 0;
602 #define JETSAM_SNAPSHOT_TIMEOUT_SECS 30
603
604 /*
605 * snapshot support for memstats collected at boot.
606 */
607 static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot;
608
609 static void memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count);
610 static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount);
611 static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime);
612
613 static void memorystatus_clear_errors(void);
614 static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages);
615 static void memorystatus_get_task_phys_footprint_page_counts(task_t task,
616 uint64_t *internal_pages, uint64_t *internal_compressed_pages,
617 uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
618 uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
619 uint64_t *iokit_mapped_pages, uint64_t *page_table_pages);
620
621 static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *count);
622
623 static uint32_t memorystatus_build_state(proc_t p);
624 //static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);
625
626 static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority, uint32_t *errors);
627 static boolean_t memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors);
628 static boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, int aggr_count, uint32_t *errors);
629 static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors);
630
631 static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);
632
633 /* Priority Band Sorting Routines */
634 static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order);
635 static int memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order);
636 static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index);
637 static int memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz);
638
639 /* qsort routines */
640 typedef int (*cmpfunc_t)(const void *a, const void *b);
641 extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
642 static int memstat_asc_cmp(const void *a, const void *b);
643
644 /* VM pressure */
645
646 extern unsigned int vm_page_free_count;
647 extern unsigned int vm_page_active_count;
648 extern unsigned int vm_page_inactive_count;
649 extern unsigned int vm_page_throttled_count;
650 extern unsigned int vm_page_purgeable_count;
651 extern unsigned int vm_page_wire_count;
652 #if CONFIG_SECLUDED_MEMORY
653 extern unsigned int vm_page_secluded_count;
654 #endif /* CONFIG_SECLUDED_MEMORY */
655
656 #if CONFIG_JETSAM
657 unsigned int memorystatus_available_pages = (unsigned int)-1;
658 unsigned int memorystatus_available_pages_pressure = 0;
659 unsigned int memorystatus_available_pages_critical = 0;
660 static unsigned int memorystatus_available_pages_critical_base = 0;
661 static unsigned int memorystatus_available_pages_critical_idle_offset = 0;
662
663 #if DEVELOPMENT || DEBUG
664 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
665 #else
666 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
667 #endif /* DEVELOPMENT || DEBUG */
668
669 static unsigned int memorystatus_jetsam_policy = kPolicyDefault;
670 unsigned int memorystatus_policy_more_free_offset_pages = 0;
671 static void memorystatus_update_levels_locked(boolean_t critical_only);
672 static unsigned int memorystatus_thread_wasted_wakeup = 0;
673
674 /* Callback into vm_compressor.c to signal that thrashing has been mitigated. */
675 extern void vm_thrashing_jetsam_done(void);
676 static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);
677
678 int32_t max_kill_priority = JETSAM_PRIORITY_MAX;
679
680 #else /* CONFIG_JETSAM */
681
682 uint64_t memorystatus_available_pages = (uint64_t)-1;
683 uint64_t memorystatus_available_pages_pressure = (uint64_t)-1;
684 uint64_t memorystatus_available_pages_critical = (uint64_t)-1;
685
686 int32_t max_kill_priority = JETSAM_PRIORITY_IDLE;
687 #endif /* CONFIG_JETSAM */
688
689 unsigned int memorystatus_frozen_count = 0;
690 unsigned int memorystatus_suspended_count = 0;
691
692 #if VM_PRESSURE_EVENTS
693
694 boolean_t memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t exceeded);
695
696 vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;
697
698 /*
699 * We use this flag to signal if we have any HWM offenders
700 * on the system. This way we can reduce the number of wakeups
701 * of the memorystatus_thread when the system is between the
702 * "pressure" and "critical" threshold.
703 *
704 * The (re-)setting of this variable is done without any locks
705 * or synchronization simply because it is not possible (currently)
706 * to keep track of HWM offenders that drop down below their memory
707 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
708 * by allowing the unguarded modification of this variable.
709 */
710 boolean_t memorystatus_hwm_candidates = 0;
711
712 static int memorystatus_send_note(int event_code, void *data, size_t data_length);
713
714 #endif /* VM_PRESSURE_EVENTS */
715
716
717 #if DEVELOPMENT || DEBUG
718
719 lck_grp_attr_t *disconnect_page_mappings_lck_grp_attr;
720 lck_grp_t *disconnect_page_mappings_lck_grp;
721 static lck_mtx_t disconnect_page_mappings_mutex;
722
723 extern boolean_t kill_on_no_paging_space;
724 #endif /* DEVELOPMENT || DEBUG */
725
726
727 /* Freeze */
728
729 #if CONFIG_FREEZE
730
731 boolean_t memorystatus_freeze_enabled = FALSE;
732 int memorystatus_freeze_wakeup = 0;
733
734 lck_grp_attr_t *freezer_lck_grp_attr;
735 lck_grp_t *freezer_lck_grp;
736 static lck_mtx_t freezer_mutex;
737
738 static inline boolean_t memorystatus_can_freeze_processes(void);
739 static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low);
740
741 static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused);
742
743 /* Thresholds */
744 static unsigned int memorystatus_freeze_threshold = 0;
745
746 static unsigned int memorystatus_freeze_pages_min = 0;
747 static unsigned int memorystatus_freeze_pages_max = 0;
748
749 static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
750
751 static unsigned int memorystatus_freeze_daily_mb_max = FREEZE_DAILY_MB_MAX_DEFAULT;
752
753 /* Stats */
754 static uint64_t memorystatus_freeze_count = 0;
755 static uint64_t memorystatus_freeze_pageouts = 0;
756
757 /* Throttling */
758 static throttle_interval_t throttle_intervals[] = {
759 { 60, 8, 0, 0, { 0, 0 }, FALSE }, /* 1 hour intermediate interval, 8x burst */
760 { 24 * 60, 1, 0, 0, { 0, 0 }, FALSE }, /* 24 hour long interval, no burst */
761 };
762
763 static uint64_t memorystatus_freeze_throttle_count = 0;
764
765 static unsigned int memorystatus_suspended_footprint_total = 0; /* pages */
766
767 extern uint64_t vm_swap_get_free_space(void);
768
769 static boolean_t memorystatus_freeze_update_throttle(void);
770
771 #endif /* CONFIG_FREEZE */
772
773 /* Debug */
774
775 extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);
776
777 #if DEVELOPMENT || DEBUG
778
779 static unsigned int memorystatus_debug_dump_this_bucket = 0;
780
781 static void
782 memorystatus_debug_dump_bucket_locked (unsigned int bucket_index)
783 {
784 proc_t p = NULL;
785 uint64_t bytes = 0;
786 int ledger_limit = 0;
787 unsigned int b = bucket_index;
788 boolean_t traverse_all_buckets = FALSE;
789
790 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
791 traverse_all_buckets = TRUE;
792 b = 0;
793 } else {
794 traverse_all_buckets = FALSE;
795 b = bucket_index;
796 }
797
798 /*
799 * footprint reported in [pages / MB ]
800 * limits reported as:
801 * L-limit proc's Ledger limit
802 * C-limit proc's Cached limit, should match Ledger
803 * A-limit proc's Active limit
804 * IA-limit proc's Inactive limit
805 * F==Fatal, NF==NonFatal
806 */
807
808 printf("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64);
809 printf("bucket [pid] [pages / MB] [state] [EP / RP] dirty deadline [L-limit / C-limit / A-limit / IA-limit] name\n");
810 p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
811 while (p) {
812 bytes = get_task_phys_footprint(p->task);
813 task_get_phys_footprint_limit(p->task, &ledger_limit);
814 printf("%2d [%5d] [%5lld /%3lldMB] 0x%-8x [%2d / %2d] 0x%-3x %10lld [%3d / %3d%s / %3d%s / %3d%s] %s\n",
815 b, p->p_pid,
816 (bytes / PAGE_SIZE_64), /* task's footprint converted from bytes to pages */
817 (bytes / (1024ULL * 1024ULL)), /* task's footprint converted from bytes to MB */
818 p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_dirty, p->p_memstat_idledeadline,
819 ledger_limit,
820 p->p_memstat_memlimit,
821 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
822 p->p_memstat_memlimit_active,
823 (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"),
824 p->p_memstat_memlimit_inactive,
825 (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"),
826 (*p->p_name ? p->p_name : "unknown"));
827 p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets);
828 }
829 printf("memorystatus_debug_dump ***END***\n");
830 }
831
832 static int
833 sysctl_memorystatus_debug_dump_bucket SYSCTL_HANDLER_ARGS
834 {
835 #pragma unused(oidp, arg2)
836 int bucket_index = 0;
837 int error;
838 error = SYSCTL_OUT(req, arg1, sizeof(int));
839 if (error || !req->newptr) {
840 return (error);
841 }
842 error = SYSCTL_IN(req, &bucket_index, sizeof(int));
843 if (error || !req->newptr) {
844 return (error);
845 }
846 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
847 /*
848 * All jetsam buckets will be dumped.
849 */
850 } else {
851 /*
852 * Only a single bucket will be dumped.
853 */
854 }
855
856 proc_list_lock();
857 memorystatus_debug_dump_bucket_locked(bucket_index);
858 proc_list_unlock();
859 memorystatus_debug_dump_this_bucket = bucket_index;
860 return (error);
861 }
862
863 /*
864 * Debug aid to look at jetsam buckets and proc jetsam fields.
865 * Use this sysctl to act on a particular jetsam bucket.
866 * Writing the sysctl triggers the dump.
867 * Usage: sysctl kern.memorystatus_debug_dump_this_bucket=<bucket_index>
868 */
869
870 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_debug_dump_this_bucket, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_debug_dump_this_bucket, 0, sysctl_memorystatus_debug_dump_bucket, "I", "");
871
872
873 /* Debug aid for determining a limit */
874
875 static int
876 sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
877 {
878 #pragma unused(oidp, arg2)
879 proc_t p;
880 unsigned int b = 0;
881 int error, enable = 0;
882 boolean_t use_active; /* use the active limit and active limit attributes */
883 boolean_t is_fatal;
884
885 error = SYSCTL_OUT(req, arg1, sizeof(int));
886 if (error || !req->newptr) {
887 return (error);
888 }
889
890 error = SYSCTL_IN(req, &enable, sizeof(int));
891 if (error || !req->newptr) {
892 return (error);
893 }
894
895 if (!(enable == 0 || enable == 1)) {
896 return EINVAL;
897 }
898
899 proc_list_lock();
900
901 p = memorystatus_get_first_proc_locked(&b, TRUE);
902 while (p) {
903 use_active = proc_jetsam_state_is_active_locked(p);
904
905 if (enable) {
906
907 if (use_active == TRUE) {
908 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
909 } else {
910 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
911 }
912
913 } else {
914 /*
915 * Disabling limits does not touch the stored variants.
916  * Set the cached limit fields to system-wide defaults.
917 */
918 p->p_memstat_memlimit = -1;
919 p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
920 is_fatal = TRUE;
921 }
922
923 /*
924 * Enforce the cached limit by writing to the ledger.
925 */
926 task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit: -1, NULL, use_active, is_fatal);
927
928 p = memorystatus_get_next_proc_locked(&b, p, TRUE);
929 }
930
931 memorystatus_highwater_enabled = enable;
932
933 proc_list_unlock();
934
935 return 0;
936
937 }
938
939 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");
940
941 #if VM_PRESSURE_EVENTS
942
943 /*
944 * This routine is used for targeted notifications regardless of system memory pressure
945 * and regardless of whether or not the process has already been notified.
946 * It bypasses and has no effect on the only-one-notification per soft-limit policy.
947 *
948 * "memnote" is the current user.
949 */
950
951 static int
952 sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
953 {
954 #pragma unused(arg1, arg2)
955
956 int error = 0, pid = 0;
957 struct knote *kn = NULL;
958 boolean_t found_knote = FALSE;
959 int fflags = 0; /* filter flags for EVFILT_MEMORYSTATUS */
960 uint64_t value = 0;
961
962 error = sysctl_handle_quad(oidp, &value, 0, req);
963 if (error || !req->newptr)
964 return (error);
965
966 /*
967 * Find the pid in the low 32 bits of value passed in.
968 */
969 pid = (int)(value & 0xFFFFFFFF);
970
971 /*
972 * Find notification in the high 32 bits of the value passed in.
973 */
974 fflags = (int)((value >> 32) & 0xFFFFFFFF);
975
976 /*
977 * For backwards compatibility, when no notification is
978  * passed in, default to NOTE_MEMORYSTATUS_PRESSURE_WARN.
979 */
980 if (fflags == 0) {
981 fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
982 // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
983 }
984
985 /*
986 * See event.h ... fflags for EVFILT_MEMORYSTATUS
987 */
988 if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL)||
989 (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
990 (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
991 (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
992 (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
993 (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
994 (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
995 ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
996
997 printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags);
998 error = 1;
999 return (error);
1000 }
1001
1002 /*
1003 * Forcibly send pid a memorystatus notification.
1004 */
1005
1006 memorystatus_klist_lock();
1007
1008 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1009 proc_t knote_proc = knote_get_kq(kn)->kq_p;
1010 pid_t knote_pid = knote_proc->p_pid;
1011
1012 if (knote_pid == pid) {
1013 /*
1014 * Forcibly send this pid a memorystatus notification.
1015 */
1016 kn->kn_fflags = fflags;
1017 found_knote = TRUE;
1018 }
1019 }
1020
1021 if (found_knote) {
1022 KNOTE(&memorystatus_klist, 0);
1023 printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid);
1024 error = 0;
1025 } else {
1026 printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
1027 error = 1;
1028 }
1029
1030 memorystatus_klist_unlock();
1031
1032 return (error);
1033 }
1034
1035 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
1036 0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
1037
1038 #endif /* VM_PRESSURE_EVENTS */
1039
1040 SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");
1041
1042 #if CONFIG_JETSAM
1043 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
1044 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
1045 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");
1046 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_policy_more_free_offset_pages, CTLFLAG_RW, &memorystatus_policy_more_free_offset_pages, 0, "");
1047
1048 static unsigned int memorystatus_jetsam_panic_debug = 0;
1049 static unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0;
1050
1051 /* Diagnostic code */
1052
1053 enum {
1054 kJetsamDiagnosticModeNone = 0,
1055 kJetsamDiagnosticModeAll = 1,
1056 kJetsamDiagnosticModeStopAtFirstActive = 2,
1057 kJetsamDiagnosticModeCount
1058 } jetsam_diagnostic_mode = kJetsamDiagnosticModeNone;
1059
1060 static int jetsam_diagnostic_suspended_one_active_proc = 0;
1061
1062 static int
1063 sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS
1064 {
1065 #pragma unused(arg1, arg2)
1066
1067 const char *diagnosticStrings[] = {
1068 "jetsam: diagnostic mode: resetting critical level.",
1069 "jetsam: diagnostic mode: will examine all processes",
1070 "jetsam: diagnostic mode: will stop at first active process"
1071 };
1072
1073 int error, val = jetsam_diagnostic_mode;
1074 boolean_t changed = FALSE;
1075
1076 error = sysctl_handle_int(oidp, &val, 0, req);
1077 if (error || !req->newptr)
1078 return (error);
1079 if ((val < 0) || (val >= kJetsamDiagnosticModeCount)) {
1080 printf("jetsam: diagnostic mode: invalid value - %d\n", val);
1081 return EINVAL;
1082 }
1083
1084 proc_list_lock();
1085
1086 if ((unsigned int) val != jetsam_diagnostic_mode) {
1087 jetsam_diagnostic_mode = val;
1088
1089 memorystatus_jetsam_policy &= ~kPolicyDiagnoseActive;
1090
1091 switch (jetsam_diagnostic_mode) {
1092 case kJetsamDiagnosticModeNone:
1093 /* Already cleared */
1094 break;
1095 case kJetsamDiagnosticModeAll:
1096 memorystatus_jetsam_policy |= kPolicyDiagnoseAll;
1097 break;
1098 case kJetsamDiagnosticModeStopAtFirstActive:
1099 memorystatus_jetsam_policy |= kPolicyDiagnoseFirst;
1100 break;
1101 default:
1102 /* Already validated */
1103 break;
1104 }
1105
1106 memorystatus_update_levels_locked(FALSE);
1107 changed = TRUE;
1108 }
1109
1110 proc_list_unlock();
1111
1112 if (changed) {
1113 printf("%s\n", diagnosticStrings[val]);
1114 }
1115
1116 return (0);
1117 }
1118
1119 SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED|CTLFLAG_ANYBODY,
1120 &jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode");
1121
1122 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jetsam_policy_offset_pages_diagnostic, 0, "");
1123
1124 #if VM_PRESSURE_EVENTS
1125
1126 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");
1127
1128 #endif /* VM_PRESSURE_EVENTS */
1129
1130 #endif /* CONFIG_JETSAM */
1131
1132 #if CONFIG_FREEZE
1133
1134 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, "");
1135
1136 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");
1137
1138 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
1139 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");
1140
1141 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, "");
1142 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
1143 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, "");
1144 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");
1145
1146 boolean_t memorystatus_freeze_throttle_enabled = TRUE;
1147 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");
1148
1149 #define VM_PAGES_FOR_ALL_PROCS (2)
1150 /*
1151 * Manual trigger of freeze and thaw for dev / debug kernels only.
1152 */
1153 static int
1154 sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
1155 {
1156 #pragma unused(arg1, arg2)
1157 int error, pid = 0;
1158 proc_t p;
1159
1160 if (memorystatus_freeze_enabled == FALSE) {
1161 return ENOTSUP;
1162 }
1163
1164 error = sysctl_handle_int(oidp, &pid, 0, req);
1165 if (error || !req->newptr)
1166 return (error);
1167
1168 if (pid == VM_PAGES_FOR_ALL_PROCS) {
1169 vm_pageout_anonymous_pages();
1170
1171 return 0;
1172 }
1173
1174 lck_mtx_lock(&freezer_mutex);
1175
1176 p = proc_find(pid);
1177 if (p != NULL) {
1178 uint32_t purgeable, wired, clean, dirty;
1179 boolean_t shared;
1180 uint32_t max_pages = 0;
1181
1182 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
1183
1184 unsigned int avail_swap_space = 0; /* in pages. */
1185
1186 /*
1187  * Freezer backed by the compressor and swap file(s),
1188  * which will hold the compressed data.
1189 */
1190 avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
1191
1192 max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);
1193
1194 } else {
1195 /*
1196 * We only have the compressor without any swap.
1197 */
1198 max_pages = UINT32_MAX - 1;
1199 }
1200
1201 error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
1202 proc_rele(p);
1203
1204 if (error)
1205 error = EIO;
1206
1207 lck_mtx_unlock(&freezer_mutex);
1208 return error;
1209 }
1210
1211 lck_mtx_unlock(&freezer_mutex);
1212 return EINVAL;
1213 }
1214
1215 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
1216 0, 0, &sysctl_memorystatus_freeze, "I", "");
1217
1218 static int
1219 sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
1220 {
1221 #pragma unused(arg1, arg2)
1222
1223 int error, pid = 0;
1224 proc_t p;
1225
1226 if (memorystatus_freeze_enabled == FALSE) {
1227 return ENOTSUP;
1228 }
1229
1230 error = sysctl_handle_int(oidp, &pid, 0, req);
1231 if (error || !req->newptr)
1232 return (error);
1233
1234 if (pid == VM_PAGES_FOR_ALL_PROCS) {
1235 do_fastwake_warmup_all();
1236 return 0;
1237 } else {
1238 p = proc_find(pid);
1239 if (p != NULL) {
1240 error = task_thaw(p->task);
1241 proc_rele(p);
1242
1243 if (error)
1244 error = EIO;
1245 return error;
1246 }
1247 }
1248
1249 return EINVAL;
1250 }
1251
1252 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
1253 0, 0, &sysctl_memorystatus_available_pages_thaw, "I", "");
1254
1255 #endif /* CONFIG_FREEZE */
1256
1257 #endif /* DEVELOPMENT || DEBUG */
1258
1259 extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
1260 void *parameter,
1261 integer_t priority,
1262 thread_t *new_thread);
1263
1264 #if DEVELOPMENT || DEBUG
1265
1266 static int
1267 sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS
1268 {
1269 #pragma unused(arg1, arg2)
1270 int error = 0, pid = 0;
1271 proc_t p;
1272
1273 error = sysctl_handle_int(oidp, &pid, 0, req);
1274 if (error || !req->newptr)
1275 return (error);
1276
1277 lck_mtx_lock(&disconnect_page_mappings_mutex);
1278
1279 if (pid == -1) {
1280 vm_pageout_disconnect_all_pages();
1281 } else {
1282 p = proc_find(pid);
1283
1284 if (p != NULL) {
1285 error = task_disconnect_page_mappings(p->task);
1286
1287 proc_rele(p);
1288
1289 if (error)
1290 error = EIO;
1291 } else
1292 error = EINVAL;
1293 }
1294 lck_mtx_unlock(&disconnect_page_mappings_mutex);
1295
1296 return error;
1297 }
1298
1299 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
1300 0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", "");
1301
1302 #endif /* DEVELOPMENT || DEBUG */
1303
1304
1305 /*
1306 * Picks the sorting routine for a given jetsam priority band.
1307 *
1308 * Input:
1309 * bucket_index - jetsam priority band to be sorted.
1310 * sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
1311 * Currently sort_order is only meaningful when handling
1312 * coalitions.
1313 *
1314 * Return:
1315 * 0 on success
1316 * non-0 on failure
1317 */
1318 static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order)
1319 {
1320 int coal_sort_order;
1321
1322 /*
1323 * Verify the jetsam priority
1324 */
1325 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1326 return(EINVAL);
1327 }
1328
1329 #if DEVELOPMENT || DEBUG
1330 if (sort_order == JETSAM_SORT_DEFAULT) {
1331 coal_sort_order = COALITION_SORT_DEFAULT;
1332 } else {
1333 coal_sort_order = sort_order; /* only used for testing scenarios */
1334 }
1335 #else
1336 /* Verify default */
1337 if (sort_order == JETSAM_SORT_DEFAULT) {
1338 coal_sort_order = COALITION_SORT_DEFAULT;
1339 } else {
1340 return(EINVAL);
1341 }
1342 #endif
1343
1344 proc_list_lock();
1345
1346 if (memstat_bucket[bucket_index].count == 0) {
1347 proc_list_unlock();
1348 return (0);
1349 }
1350
1351 switch (bucket_index) {
1352 case JETSAM_PRIORITY_FOREGROUND:
1353 if (memorystatus_sort_by_largest_coalition_locked(bucket_index, coal_sort_order) == 0) {
1354 /*
1355  * Fall back to per-process sorting when no coalitions are found.
1356 */
1357 memorystatus_sort_by_largest_process_locked(bucket_index);
1358 }
1359 break;
1360 default:
1361 memorystatus_sort_by_largest_process_locked(bucket_index);
1362 break;
1363 }
1364 proc_list_unlock();
1365
1366 return(0);
1367 }
1368
1369 /*
1370 * Sort processes by size for a single jetsam bucket.
1371 */
1372
1373 static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
1374 {
1375 proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
1376 proc_t next_p = NULL, prev_max_proc = NULL;
1377 uint32_t pages = 0, max_pages = 0;
1378 memstat_bucket_t *current_bucket;
1379
1380 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1381 return;
1382 }
1383
1384 current_bucket = &memstat_bucket[bucket_index];
1385
1386 p = TAILQ_FIRST(&current_bucket->list);
1387
1388 while (p) {
1389 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
1390 max_pages = pages;
1391 max_proc = p;
1392 prev_max_proc = p;
1393
1394 while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
1395 /* traversing list until we find next largest process */
1396 p=next_p;
1397 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
1398 if (pages > max_pages) {
1399 max_pages = pages;
1400 max_proc = p;
1401 }
1402 }
1403
1404 if (prev_max_proc != max_proc) {
1405 /* found a larger process, place it in the list */
1406 TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
1407 if (insert_after_proc == NULL) {
1408 TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
1409 } else {
1410 TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
1411 }
1412 prev_max_proc = max_proc;
1413 }
1414
1415 insert_after_proc = max_proc;
1416
1417 p = TAILQ_NEXT(max_proc, p_memstat_list);
1418 }
1419 }
1420
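/*
 * Bucket traversal helpers. The canonical (illustrative) pattern, with the
 * proc list lock held -- this is what memorystatus_debug_dump_bucket_locked()
 * and sysctl_memorystatus_highwater_enable() above use:
 *
 *	unsigned int i = 0;
 *	proc_t p = memorystatus_get_first_proc_locked(&i, TRUE);
 *	while (p) {
 *		... examine or update p ...
 *		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
 *	}
 *
 * Passing TRUE for 'search' lets the walk continue into higher-indexed
 * buckets once the current one is exhausted.
 */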
1421 static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search) {
1422 memstat_bucket_t *current_bucket;
1423 proc_t next_p;
1424
1425 if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
1426 return NULL;
1427 }
1428
1429 current_bucket = &memstat_bucket[*bucket_index];
1430 next_p = TAILQ_FIRST(&current_bucket->list);
1431 if (!next_p && search) {
1432 while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1433 current_bucket = &memstat_bucket[*bucket_index];
1434 next_p = TAILQ_FIRST(&current_bucket->list);
1435 }
1436 }
1437
1438 return next_p;
1439 }
1440
1441 static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search) {
1442 memstat_bucket_t *current_bucket;
1443 proc_t next_p;
1444
1445 if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
1446 return NULL;
1447 }
1448
1449 next_p = TAILQ_NEXT(p, p_memstat_list);
1450 while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1451 current_bucket = &memstat_bucket[*bucket_index];
1452 next_p = TAILQ_FIRST(&current_bucket->list);
1453 }
1454
1455 return next_p;
1456 }
1457
1458 __private_extern__ void
1459 memorystatus_init(void)
1460 {
1461 thread_t thread = THREAD_NULL;
1462 kern_return_t result;
1463 int i;
1464
1465 #if CONFIG_FREEZE
1466 memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
1467 memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
1468 #endif
1469
1470 #if DEVELOPMENT || DEBUG
1471 disconnect_page_mappings_lck_grp_attr = lck_grp_attr_alloc_init();
1472 disconnect_page_mappings_lck_grp = lck_grp_alloc_init("disconnect_page_mappings", disconnect_page_mappings_lck_grp_attr);
1473
1474 lck_mtx_init(&disconnect_page_mappings_mutex, disconnect_page_mappings_lck_grp, NULL);
1475
1476 if (kill_on_no_paging_space == TRUE) {
1477 max_kill_priority = JETSAM_PRIORITY_MAX;
1478 }
1479 #endif
1480
1481
1482 /* Init buckets */
1483 for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
1484 TAILQ_INIT(&memstat_bucket[i].list);
1485 memstat_bucket[i].count = 0;
1486 }
1487 memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);
1488
1489 #if CONFIG_JETSAM
1490 nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
1491 nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
1492
1493 /* Apply overrides */
1494 PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
1495 if (delta_percentage == 0) {
1496 delta_percentage = 5;
1497 }
1498 assert(delta_percentage < 100);
1499 PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
1500 assert(critical_threshold_percentage < 100);
1501 PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
1502 assert(idle_offset_percentage < 100);
1503 PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
1504 assert(pressure_threshold_percentage < 100);
1505 PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
1506 assert(freeze_threshold_percentage < 100);
1507
1508 if (!PE_parse_boot_argn("jetsam_aging_policy", &jetsam_aging_policy,
1509 sizeof (jetsam_aging_policy))) {
1510
1511 if (!PE_get_default("kern.jetsam_aging_policy", &jetsam_aging_policy,
1512 sizeof(jetsam_aging_policy))) {
1513
1514 jetsam_aging_policy = kJetsamAgingPolicyLegacy;
1515 }
1516 }
1517
1518 if (jetsam_aging_policy > kJetsamAgingPolicyMax) {
1519 jetsam_aging_policy = kJetsamAgingPolicyLegacy;
1520 }
1521
1522 switch (jetsam_aging_policy) {
1523
1524 case kJetsamAgingPolicyNone:
1525 system_procs_aging_band = JETSAM_PRIORITY_IDLE;
1526 applications_aging_band = JETSAM_PRIORITY_IDLE;
1527 break;
1528
1529 case kJetsamAgingPolicyLegacy:
1530 /*
1531  * Legacy behavior: some daemons get a one-time 10s protection,
1532  * granted only before their first clean->dirty->clean transition,
1533  * before moving into the IDLE band.
1534 */
1535 system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
1536 applications_aging_band = JETSAM_PRIORITY_IDLE;
1537 break;
1538
1539 case kJetsamAgingPolicySysProcsReclaimedFirst:
1540 system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
1541 applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
1542 break;
1543
1544 case kJetsamAgingPolicyAppsReclaimedFirst:
1545 system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2;
1546 applications_aging_band = JETSAM_PRIORITY_AGING_BAND1;
1547 break;
1548
1549 default:
1550 break;
1551 }
1552
1553 /*
1554 * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE
1555 * band and must be below it in priority. This is so that we don't have to make
1556 * our 'aging' code worry about a mix of processes, some of which need to age
1557 * and some others that need to stay elevated in the jetsam bands.
1558 */
1559 assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band);
1560 assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band);
1561
1562 /* Take snapshots for idle-exit kills by default? First check the boot-arg... */
1563 if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof (memorystatus_idle_snapshot))) {
1564 /* ...no boot-arg, so check the device tree */
1565 PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));
1566 }
1567
1568 memorystatus_delta = delta_percentage * atop_64(max_mem) / 100;
1569 memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100;
1570 memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta;
1571 memorystatus_policy_more_free_offset_pages = (policy_more_free_offset_percentage / delta_percentage) * memorystatus_delta;
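/*
 * Worked example with the compile-time defaults (the delta, critical-threshold,
 * idle-offset, and more-free-offset percentages all default to 5, barring
 * device-tree overrides): memorystatus_delta works out to 5% of the device's
 * physical page count, and because the percentage ratios above are integer
 * divisions (5 / 5 == 1), the critical base and the more-free offset each come
 * out equal to one memorystatus_delta.
 */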
1572
1573 /* Jetsam Loop Detection */
1574 if (max_mem <= (512 * 1024 * 1024)) {
1575 /* 512 MB devices */
1576 memorystatus_jld_eval_period_msecs = 8000; /* 8000 msecs == 8 second window */
1577 } else {
1578 /* 1GB and larger devices */
1579 memorystatus_jld_eval_period_msecs = 6000; /* 6000 msecs == 6 second window */
1580 }
1581
1582 memorystatus_jld_enabled = TRUE;
1583
1584 /* No contention at this point */
1585 memorystatus_update_levels_locked(FALSE);
1586
1587 #endif /* CONFIG_JETSAM */
1588
1589 memorystatus_jetsam_snapshot_max = maxproc;
1590 memorystatus_jetsam_snapshot =
1591 (memorystatus_jetsam_snapshot_t*)kalloc(sizeof(memorystatus_jetsam_snapshot_t) +
1592 sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
1593 if (!memorystatus_jetsam_snapshot) {
1594 panic("Could not allocate memorystatus_jetsam_snapshot");
1595 }
1596
1597 nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);
1598
1599 memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));
1600
1601 #if CONFIG_FREEZE
1602 memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta;
1603 #endif
1604
1605 result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
1606 if (result == KERN_SUCCESS) {
1607 thread_deallocate(thread);
1608 } else {
1609 panic("Could not create memorystatus_thread");
1610 }
1611 }
1612
1613 /* Centralised for the purposes of allowing panic-on-jetsam */
1614 extern void
1615 vm_run_compactor(void);
1616
1617 /*
1618 * The jetsam no frills kill call
1619 * Return: 0 on success
1620 * error code on failure (EINVAL...)
1621 */
1622 static int
1623 jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason) {
1624 int error = 0;
1625 error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason);
1626 return(error);
1627 }
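/*
 * Illustrative reading (not a statement about specific call sites): since
 * jetsam_do_kill() hands W_EXITCODE(0, SIGKILL) to exit_with_reason(), a
 * parent observing the victim via waitpid() would typically see a
 * signal-style exit, e.g.
 *
 *   waitpid(victim_pid, &status, 0);
 *   WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL   // expected to hold
 */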
1628
1629 /*
1630 * Wrapper for processes exiting with memorystatus details
1631 */
1632 static boolean_t
1633 memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason) {
1634
1635 int error = 0;
1636 __unused pid_t victim_pid = p->p_pid;
1637
1638 KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START,
1639 victim_pid, cause, vm_page_free_count, 0, 0);
1640
1641 DTRACE_MEMORYSTATUS3(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause);
1642 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
1643 if (memorystatus_jetsam_panic_debug & (1 << cause)) {
1644 panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause);
1645 }
1646 #else
1647 #pragma unused(cause)
1648 #endif
1649
1650 if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
1651 printf("memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n", p->p_pid,
1652 (*p->p_name ? p->p_name : "unknown"),
1653 memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority,
1654 (uint64_t)memorystatus_available_pages);
1655 }
1656
1657 int jetsam_flags = P_LTERM_JETSAM;
1658 switch (cause) {
1659 case kMemorystatusKilledHiwat: jetsam_flags |= P_JETSAM_HIWAT; break;
1660 case kMemorystatusKilledVnodes: jetsam_flags |= P_JETSAM_VNODE; break;
1661 case kMemorystatusKilledVMPageShortage: jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
1662 case kMemorystatusKilledVMThrashing: jetsam_flags |= P_JETSAM_VMTHRASHING; break;
1663 case kMemorystatusKilledFCThrashing: jetsam_flags |= P_JETSAM_FCTHRASHING; break;
1664 case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break;
1665 case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break;
1666 }
1667 error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
1668
1669 KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END,
1670 victim_pid, cause, vm_page_free_count, error, 0);
1671
1672 vm_run_compactor();
1673
1674 return (error == 0);
1675 }
1676
1677 /*
1678 * Node manipulation
1679 */
1680
1681 static void
1682 memorystatus_check_levels_locked(void) {
1683 #if CONFIG_JETSAM
1684 /* Update levels */
1685 memorystatus_update_levels_locked(TRUE);
1686 #else /* CONFIG_JETSAM */
1687 /*
1688 * Nothing to do here currently since we update
1689 * memorystatus_available_pages in vm_pressure_response.
1690 */
1691 #endif /* CONFIG_JETSAM */
1692 }
1693
1694 /*
1695 * Pin a process to a particular jetsam band when it is in the background, i.e. not doing active work.
1696 * For an application: that means no longer in the FG band
1697 * For a daemon: that means no longer in its 'requested' jetsam priority band
1698 */
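/*
 * Illustrative user-space sketch (an assumption, not part of this file): a
 * suitably entitled process would normally reach this routine through the
 * private memorystatus_control() syscall, roughly:
 *
 *   memorystatus_control(MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE,
 *                        pid, 0, NULL, 0);
 *
 * The command value arrives here as op_flags; entitlement checks happen in
 * the memorystatus_control() handler, not in this function.
 */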
1699
1700 int
1701 memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, boolean_t effective_now)
1702 {
1703 int error = 0;
1704 boolean_t enable = FALSE;
1705 proc_t p = NULL;
1706
1707 if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) {
1708 enable = TRUE;
1709 } else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) {
1710 enable = FALSE;
1711 } else {
1712 return EINVAL;
1713 }
1714
1715 p = proc_find(pid);
1716 if (p != NULL) {
1717
1718 if ((enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) ||
1719 (!enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == 0))) {
1720 /*
1721 * No change in state.
1722 */
1723
1724 } else {
1725
1726 proc_list_lock();
1727
1728 if (enable) {
1729 p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
1730 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1731
1732 if (effective_now) {
1733 if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_ELEVATED_INACTIVE) {
1734 if (memorystatus_highwater_enabled) {
1735 /*
1736 * Process is about to transition from
1737 * inactive --> active
1738 * assign active state
1739 */
1740 boolean_t is_fatal;
1741 boolean_t use_active = TRUE;
1742 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
1743 task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
1744 }
1745 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_ELEVATED_INACTIVE, FALSE, FALSE);
1746 }
1747 } else {
1748 if (isProcessInAgingBands(p)) {
1749 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
1750 }
1751 }
1752 } else {
1753
1754 p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
1755 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1756
1757 if (effective_now) {
1758 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE) {
1759 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
1760 }
1761 } else {
1762 if (isProcessInAgingBands(p)) {
1763 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
1764 }
1765 }
1766 }
1767
1768 proc_list_unlock();
1769 }
1770 proc_rele(p);
1771 error = 0;
1772
1773 } else {
1774 error = ESRCH;
1775 }
1776
1777 return error;
1778 }
1779
1780 static void
1781 memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
1782 {
1783 proc_t p;
1784 uint64_t current_time = 0, idle_delay_time = 0;
1785 int demote_prio_band = 0;
1786 memstat_bucket_t *demotion_bucket;
1787
1788 MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");
1789
1790 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);
1791
1792 current_time = mach_absolute_time();
1793
1794 proc_list_lock();
1795
1796 demote_prio_band = JETSAM_PRIORITY_IDLE + 1;
1797
1798 for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) {
1799
1800 if (demote_prio_band != system_procs_aging_band && demote_prio_band != applications_aging_band)
1801 continue;
1802
1803 demotion_bucket = &memstat_bucket[demote_prio_band];
1804 p = TAILQ_FIRST(&demotion_bucket->list);
1805
1806 while (p) {
1807 MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid);
1808
1809 assert(p->p_memstat_idledeadline);
1810
1811 assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
1812
1813 if (current_time >= p->p_memstat_idledeadline) {
1814
1815 if ((isSysProc(p) &&
1816 ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) != P_DIRTY_IDLE_EXIT_ENABLED)) || /* system proc marked dirty*/
1817 task_has_assertions((struct task *)(p->task))) { /* has outstanding assertions which might indicate outstanding work too */
1818 idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_delay_time : memorystatus_apps_idle_delay_time;
1819
1820 p->p_memstat_idledeadline += idle_delay_time;
1821 p = TAILQ_NEXT(p, p_memstat_list);
1822
1823 } else {
1824
1825 proc_t next_proc = NULL;
1826
1827 next_proc = TAILQ_NEXT(p, p_memstat_list);
1828 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1829
1830 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false, true);
1831
1832 p = next_proc;
1833 continue;
1834
1835 }
1836 } else {
1837 // No further candidates
1838 break;
1839 }
1840 }
1841
1842 }
1843
1844 memorystatus_reschedule_idle_demotion_locked();
1845
1846 proc_list_unlock();
1847
1848 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
1849 }
1850
1851 static void
1852 memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
1853 {
1854 boolean_t present_in_sysprocs_aging_bucket = FALSE;
1855 boolean_t present_in_apps_aging_bucket = FALSE;
1856 uint64_t idle_delay_time = 0;
1857
1858 if (jetsam_aging_policy == kJetsamAgingPolicyNone) {
1859 return;
1860 }
1861
1862 if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
1863 /*
1864 * This process isn't going to be making the trip to the lower bands.
1865 */
1866 return;
1867 }
1868
1869 if (isProcessInAgingBands(p)) {
1870
1871 if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
1872 assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) != P_DIRTY_AGING_IN_PROGRESS);
1873 }
1874
1875 if (isSysProc(p) && system_procs_aging_band) {
1876 present_in_sysprocs_aging_bucket = TRUE;
1877
1878 } else if (isApp(p) && applications_aging_band) {
1879 present_in_apps_aging_bucket = TRUE;
1880 }
1881 }
1882
1883 assert(!present_in_sysprocs_aging_bucket);
1884 assert(!present_in_apps_aging_bucket);
1885
1886 MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n",
1887 p->p_pid, p->p_memstat_dirty, set_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));
1888
1889 if (isSysProc(p)) {
1890 assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED);
1891 }
1892
1893 idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_delay_time : memorystatus_apps_idle_delay_time;
1894
1895 if (set_state) {
1896 p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS;
1897 p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time;
1898 }
1899
1900 assert(p->p_memstat_idledeadline);
1901
1902 if (isSysProc(p) && present_in_sysprocs_aging_bucket == FALSE) {
1903 memorystatus_scheduled_idle_demotions_sysprocs++;
1904
1905 } else if (isApp(p) && present_in_apps_aging_bucket == FALSE) {
1906 memorystatus_scheduled_idle_demotions_apps++;
1907 }
1908 }
1909
1910 static void
1911 memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
1912 {
1913 boolean_t present_in_sysprocs_aging_bucket = FALSE;
1914 boolean_t present_in_apps_aging_bucket = FALSE;
1915
1916 if (!system_procs_aging_band && !applications_aging_band) {
1917 return;
1918 }
1919
1920 if ((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == 0) {
1921 return;
1922 }
1923
1924 if (isProcessInAgingBands(p)) {
1925
1926 if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
1927 assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == P_DIRTY_AGING_IN_PROGRESS);
1928 }
1929
1930 if (isSysProc(p) && system_procs_aging_band) {
1931 assert(p->p_memstat_effectivepriority == system_procs_aging_band);
1932 assert(p->p_memstat_idledeadline);
1933 present_in_sysprocs_aging_bucket = TRUE;
1934
1935 } else if (isApp(p) && applications_aging_band) {
1936 assert(p->p_memstat_effectivepriority == applications_aging_band);
1937 assert(p->p_memstat_idledeadline);
1938 present_in_apps_aging_bucket = TRUE;
1939 }
1940 }
1941
1942 MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for pid %d (clear_state %d, demotions %d).\n",
1943 p->p_pid, clear_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));
1944
1945
1946 if (clear_state) {
1947 p->p_memstat_idledeadline = 0;
1948 p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS;
1949 }
1950
1951 if (isSysProc(p) && present_in_sysprocs_aging_bucket == TRUE) {
1952 memorystatus_scheduled_idle_demotions_sysprocs--;
1953 assert(memorystatus_scheduled_idle_demotions_sysprocs >= 0);
1954
1955 } else if (isApp(p) && present_in_apps_aging_bucket == TRUE) {
1956 memorystatus_scheduled_idle_demotions_apps--;
1957 assert(memorystatus_scheduled_idle_demotions_apps >= 0);
1958 }
1959
1960 assert((memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps) >= 0);
1961 }
1962
1963 static void
1964 memorystatus_reschedule_idle_demotion_locked(void) {
1965 if (0 == (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)) {
1966 if (memstat_idle_demotion_deadline) {
1967 /* Transitioned 1->0, so cancel next call */
1968 thread_call_cancel(memorystatus_idle_demotion_call);
1969 memstat_idle_demotion_deadline = 0;
1970 }
1971 } else {
1972 memstat_bucket_t *demotion_bucket;
1973 proc_t p = NULL, p1 = NULL, p2 = NULL;
1974
1975 if (system_procs_aging_band) {
1976
1977 demotion_bucket = &memstat_bucket[system_procs_aging_band];
1978 p1 = TAILQ_FIRST(&demotion_bucket->list);
1979
1980 p = p1;
1981 }
1982
1983 if (applications_aging_band) {
1984
1985 demotion_bucket = &memstat_bucket[applications_aging_band];
1986 p2 = TAILQ_FIRST(&demotion_bucket->list);
1987
1988 if (p1 && p2) {
1989 p = (p1->p_memstat_idledeadline > p2->p_memstat_idledeadline) ? p2 : p1;
1990 } else {
1991 p = (p1 == NULL) ? p2 : p1;
1992 }
1993
1994 }
1995
1996 assert(p);
1997
1998 if (p != NULL) {
1999 assert(p && p->p_memstat_idledeadline);
2000 if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline) {
2001 thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
2002 memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
2003 }
2004 }
2005 }
2006 }
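/*
 * Descriptive note: the thread call above is (re)armed for the earlier of
 * the two aging-bucket heads' deadlines; each bucket head is treated as the
 * next demotion candidate. memorystatus_perform_idle_demotion() then walks
 * the buckets when the call fires.
 */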
2007
2008 /*
2009 * List manipulation
2010 */
2011
2012 int
2013 memorystatus_add(proc_t p, boolean_t locked)
2014 {
2015 memstat_bucket_t *bucket;
2016
2017 MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority);
2018
2019 if (!locked) {
2020 proc_list_lock();
2021 }
2022
2023 DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority);
2024
2025 /* Processes marked internal do not have priority tracked */
2026 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
2027 goto exit;
2028 }
2029
2030 bucket = &memstat_bucket[p->p_memstat_effectivepriority];
2031
2032 if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
2033 assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs - 1);
2034
2035 } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
2036 assert(bucket->count == memorystatus_scheduled_idle_demotions_apps - 1);
2037
2038 } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2039 /*
2040 * Entering the idle band.
2041 * Record idle start time.
2042 */
2043 p->p_memstat_idle_start = mach_absolute_time();
2044 }
2045
2046 TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
2047 bucket->count++;
2048
2049 memorystatus_list_count++;
2050
2051 memorystatus_check_levels_locked();
2052
2053 exit:
2054 if (!locked) {
2055 proc_list_unlock();
2056 }
2057
2058 return 0;
2059 }
2060
2061 /*
2062 * Description:
2063 * Moves a process from one jetsam bucket to another,
2064 * which changes the LRU position of the process.
2065 *
2066 * Monitors transition between buckets and if necessary
2067 * will update cached memory limits accordingly.
2068 *
2069 * skip_demotion_check:
2070 * - if the 'jetsam aging policy' is NOT 'legacy':
2071 * When this flag is TRUE, it means we are going
2072 * to age the ripe processes out of the aging bands and into the
2073 * IDLE band and apply their inactive memory limits.
2074 *
2075 * - if the 'jetsam aging policy' is 'legacy':
2076 * When this flag is TRUE, it might mean the aging mechanism above,
2077 * OR
2078 * it might be that the process has used up the once-per-lifetime
2079 * 'idle deferral' stay it is given. In that case, the process won't be
2080 * going through any aging codepaths, but we still need to apply the
2081 * right inactive limits, so we explicitly set this to TRUE when the
2082 * new priority for the process is the IDLE band.
2083 */
2084 void
2085 memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check)
2086 {
2087 memstat_bucket_t *old_bucket, *new_bucket;
2088
2089 assert(priority < MEMSTAT_BUCKET_COUNT);
2090
2091 /* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
2092 if ((p->p_listflag & P_LIST_EXITED) != 0) {
2093 return;
2094 }
2095
2096 MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting %s(%d) to priority %d, inserting at %s\n",
2097 (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, head_insert ? "head" : "tail");
2098
2099 DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority);
2100
2101 #if DEVELOPMENT || DEBUG
2102 if (priority == JETSAM_PRIORITY_IDLE && /* if the process is on its way into the IDLE band */
2103 skip_demotion_check == FALSE && /* and it isn't via the path that will set the INACTIVE memlimits */
2104 (p->p_memstat_dirty & P_DIRTY_TRACK) && /* and it has 'DIRTY' tracking enabled */
2105 ((p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) || /* and we notice that the current limit isn't the right value (inactive) */
2106 ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) ? ( ! (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)) : (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)))) /* OR type (fatal vs non-fatal) */
2107 panic("memorystatus_update_priority_locked: on %s with 0x%x, prio: %d and %d\n", p->p_name, p->p_memstat_state, priority, p->p_memstat_memlimit); /* then we must catch this */
2108 #endif /* DEVELOPMENT || DEBUG */
2109
2110 old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
2111
2112 if (skip_demotion_check == FALSE) {
2113
2114 if (isSysProc(p)) {
2115 /*
2116 * For system processes, the memorystatus_dirty_* routines take care of adding/removing
2117 * the processes from the aging bands and balancing the demotion counts.
2118 * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute.
2119 */
2120
2121 if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE && (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
2122 priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
2123
2124 assert(! (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
2125 }
2126 } else if (isApp(p)) {
2127
2128 /*
2129 * Check to see if the application is being lowered in jetsam priority. If so, and:
2130 * - it has an 'elevated inactive jetsam band' attribute, then put it in the JETSAM_PRIORITY_ELEVATED_INACTIVE band.
2131 * - it is a normal application, then let it age in the aging band if that policy is in effect.
2132 */
2133
2134 if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE && (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
2135 priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
2136 } else {
2137
2138 if (applications_aging_band) {
2139 if (p->p_memstat_effectivepriority == applications_aging_band) {
2140 assert(old_bucket->count == (memorystatus_scheduled_idle_demotions_apps + 1));
2141 }
2142
2143 if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && (priority <= applications_aging_band)) {
2144 assert(! (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
2145 priority = applications_aging_band;
2146 memorystatus_schedule_idle_demotion_locked(p, TRUE);
2147 }
2148 }
2149 }
2150 }
2151 }
2152
2153 if ((system_procs_aging_band && (priority == system_procs_aging_band)) || (applications_aging_band && (priority == applications_aging_band))) {
2154 assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
2155 }
2156
2157 TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
2158 old_bucket->count--;
2159
2160 new_bucket = &memstat_bucket[priority];
2161 if (head_insert)
2162 TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
2163 else
2164 TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
2165 new_bucket->count++;
2166
2167 if (memorystatus_highwater_enabled) {
2168 boolean_t is_fatal;
2169 boolean_t use_active;
2170
2171 /*
2172 * If cached limit data is updated, then the limits
2173 * will be enforced by writing to the ledgers.
2174 */
2175 boolean_t ledger_update_needed = TRUE;
2176
2177 /*
2178 * Here, we must update the cached memory limit if the task
2179 * is transitioning between:
2180 * active <--> inactive
2181 * FG <--> BG
2182 * but:
2183 * dirty <--> clean is ignored
2184 *
2185 * We bypass non-idle processes that have opted into dirty tracking because
2186 * a move between buckets does not imply a transition between the
2187 * dirty <--> clean state.
2188 */
2189
2190 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
2191
2192 if (skip_demotion_check == TRUE && priority == JETSAM_PRIORITY_IDLE) {
2193 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
2194 use_active = FALSE;
2195 } else {
2196 ledger_update_needed = FALSE;
2197 }
2198
2199 } else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) {
2200 /*
2201 * inactive --> active
2202 * BG --> FG
2203 * assign active state
2204 */
2205 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
2206 use_active = TRUE;
2207
2208 } else if ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
2209 /*
2210 * active --> inactive
2211 * FG --> BG
2212 * assign inactive state
2213 */
2214 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
2215 use_active = FALSE;
2216 } else {
2217 /*
2218 * The transition between jetsam priority buckets apparently did
2219 * not affect active/inactive state.
2220 * This is not unusual... especially during startup when
2221 * processes are getting established in their respective bands.
2222 */
2223 ledger_update_needed = FALSE;
2224 }
2225
2226 /*
2227 * Enforce the new limits by writing to the ledger
2228 */
2229 if (ledger_update_needed) {
2230 task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
2231
2232 MEMORYSTATUS_DEBUG(3, "memorystatus_update_priority_locked: new limit on pid %d (%dMB %s) priority old --> new (%d --> %d) dirty?=0x%x %s\n",
2233 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
2234 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, priority, p->p_memstat_dirty,
2235 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
2236 }
2237 }
2238
2239 /*
2240 * Record idle start or idle delta.
2241 */
2242 if (p->p_memstat_effectivepriority == priority) {
2243 /*
2244 * This process is not transitioning between
2245 * jetsam priority buckets. Do nothing.
2246 */
2247 } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2248 uint64_t now;
2249 /*
2250 * Transitioning out of the idle priority bucket.
2251 * Record idle delta.
2252 */
2253 assert(p->p_memstat_idle_start != 0);
2254 now = mach_absolute_time();
2255 if (now > p->p_memstat_idle_start) {
2256 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
2257 }
2258 } else if (priority == JETSAM_PRIORITY_IDLE) {
2259 /*
2260 * Transitioning into the idle priority bucket.
2261 * Record idle start.
2262 */
2263 p->p_memstat_idle_start = mach_absolute_time();
2264 }
2265
2266 p->p_memstat_effectivepriority = priority;
2267
2268 #if CONFIG_SECLUDED_MEMORY
2269 if (secluded_for_apps &&
2270 task_could_use_secluded_mem(p->task)) {
2271 task_set_can_use_secluded_mem(
2272 p->task,
2273 (priority >= JETSAM_PRIORITY_FOREGROUND));
2274 }
2275 #endif /* CONFIG_SECLUDED_MEMORY */
2276
2277 memorystatus_check_levels_locked();
2278 }
2279
2280 /*
2281 *
2282 * Description: Update the jetsam priority and memory limit attributes for a given process.
2283 *
2284 * Parameters:
2285 * p	The process whose jetsam information is being initialized.
2286 * priority The jetsam priority band
2287 * user_data user specific data, unused by the kernel
2288 * effective guards against race if process's update already occurred
2289 * update_memlimit When true we know this is the init step via the posix_spawn path.
2290 *
2291 * memlimit_active Value in megabytes; The monitored footprint level while the
2292 * process is active. Exceeding it may result in termination
2293 * based on its associated fatal flag.
2294 *
2295 * memlimit_active_is_fatal When a process is active and exceeds its memory footprint,
2296 * this describes whether or not it should be immediately fatal.
2297 *
2298 * memlimit_inactive Value in megabytes; The monitored footprint level while the
2299 * process is inactive. Exceeding it may result in termination
2300 * based on its associated fatal flag.
2301 *
2302 * memlimit_inactive_is_fatal When a process is inactive and exceeds its memory footprint,
2303 * this describes whether or not it should be immediately fatal.
2304 *
2305 * Returns: 0 Success
2306 * non-0 Failure
2307 */
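/*
 * Illustrative call with hypothetical values (the real call sites live in
 * the spawn/exec path, outside this file): a newly spawned app might be set
 * up roughly as
 *
 *   memorystatus_update(p, JETSAM_PRIORITY_DEFAULT, 0, FALSE, TRUE,
 *                       300, TRUE,    // 300 MB active limit, fatal
 *                       150, FALSE);  // 150 MB inactive limit, non-fatal
 *
 * Treat the numbers purely as a sketch of the parameter ordering above.
 */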
2308
2309 int
2310 memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective, boolean_t update_memlimit,
2311 int32_t memlimit_active, boolean_t memlimit_active_is_fatal,
2312 int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal)
2313 {
2314 int ret;
2315 boolean_t head_insert = false;
2316
2317 MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing (%s) pid %d: priority %d, user_data 0x%llx\n", (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, user_data);
2318
2319 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0);
2320
2321 if (priority == -1) {
2322 /* Use as shorthand for default priority */
2323 priority = JETSAM_PRIORITY_DEFAULT;
2324 } else if ((priority == system_procs_aging_band) || (priority == applications_aging_band)) {
2325 /* Both the aging bands are reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
2326 priority = JETSAM_PRIORITY_IDLE;
2327 } else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
2328 /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
2329 priority = JETSAM_PRIORITY_IDLE;
2330 head_insert = TRUE;
2331 } else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
2332 /* Sanity check */
2333 ret = EINVAL;
2334 goto out;
2335 }
2336
2337 proc_list_lock();
2338
2339 assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
2340
2341 if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
2342 ret = EALREADY;
2343 proc_list_unlock();
2344 MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid);
2345 goto out;
2346 }
2347
2348 if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) {
2349 /*
2350 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
2351 */
2352 ret = EBUSY;
2353 proc_list_unlock();
2354 goto out;
2355 }
2356
2357 p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
2358 p->p_memstat_userdata = user_data;
2359 p->p_memstat_requestedpriority = priority;
2360
2361 if (update_memlimit) {
2362 boolean_t is_fatal;
2363 boolean_t use_active;
2364
2365 /*
2366 * Posix_spawn'd processes come through this path to instantiate ledger limits.
2367 * Forked processes do not come through this path, so no ledger limits exist.
2368 * (That's why forked processes can consume unlimited memory.)
2369 */
2370
2371 MEMORYSTATUS_DEBUG(3, "memorystatus_update(enter): pid %d, priority %d, dirty=0x%x, Active(%dMB %s), Inactive(%dMB, %s)\n",
2372 p->p_pid, priority, p->p_memstat_dirty,
2373 memlimit_active, (memlimit_active_is_fatal ? "F " : "NF"),
2374 memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF"));
2375
2376 if (memlimit_active <= 0) {
2377 /*
2378 * This process will have a system_wide task limit when active.
2379 * System_wide task limit is always fatal.
2380 * It's quite common to see a non-fatal flag passed in here.
2381 * It's not an error, we just ignore it.
2382 */
2383
2384 /*
2385 * For backward compatibility with some unexplained launchd behavior,
2386 * we allow a zero sized limit. But we still enforce system_wide limit
2387 * when written to the ledgers.
2388 */
2389
2390 if (memlimit_active < 0) {
2391 memlimit_active = -1; /* enforces system_wide task limit */
2392 }
2393 memlimit_active_is_fatal = TRUE;
2394 }
2395
2396 if (memlimit_inactive <= 0) {
2397 /*
2398 * This process will have a system_wide task limit when inactive.
2399 * System_wide task limit is always fatal.
2400 */
2401
2402 memlimit_inactive = -1;
2403 memlimit_inactive_is_fatal = TRUE;
2404 }
2405
2406 /*
2407 * Initialize the active limit variants for this process.
2408 */
2409 SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);
2410
2411 /*
2412 * Initialize the inactive limit variants for this process.
2413 */
2414 SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);
2415
2416 /*
2417 * Initialize the cached limits for target process.
2418 * When the target process is dirty-tracked, it's typically
2419 * in a clean state. Processes that are not dirty-tracked are
2420 * typically active (Foreground or above).
2421 * But just in case, we don't make assumptions...
2422 */
2423
2424 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
2425 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
2426 use_active = TRUE;
2427 } else {
2428 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
2429 use_active = FALSE;
2430 }
2431
2432 /*
2433 * Enforce the cached limit by writing to the ledger.
2434 */
2435 if (memorystatus_highwater_enabled) {
2436 /* apply now */
2437 task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal);
2438
2439 MEMORYSTATUS_DEBUG(3, "memorystatus_update: init: limit on pid %d (%dMB %s) targeting priority(%d) dirty?=0x%x %s\n",
2440 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
2441 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), priority, p->p_memstat_dirty,
2442 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
2443 }
2444 }
2445
2446 /*
2447 * We can't add the process to the aging band buckets here,
2448 * but we could be removing it from those buckets.
2449 * Check and take appropriate steps if so.
2450 */
2451
2452 if (isProcessInAgingBands(p)) {
2453
2454 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2455 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
2456 } else {
2457 if (jetsam_aging_policy == kJetsamAgingPolicyLegacy && priority == JETSAM_PRIORITY_IDLE) {
2458 /*
2459 * Daemons with 'inactive' limits will go through the dirty tracking codepath.
2460 * This path deals with apps that may have 'inactive' limits e.g. WebContent processes.
2461 * If this is the legacy aging policy we explicitly need to apply those limits. If it
2462 * is any other aging policy, then we don't need to worry because all processes
2463 * will go through the aging bands and then the demotion thread will take care to
2464 * move them into the IDLE band and apply the required limits.
2465 */
2466 memorystatus_update_priority_locked(p, priority, head_insert, TRUE);
2467 }
2468 }
2469
2470 memorystatus_update_priority_locked(p, priority, head_insert, FALSE);
2471
2472 proc_list_unlock();
2473 ret = 0;
2474
2475 out:
2476 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);
2477
2478 return ret;
2479 }
2480
2481 int
2482 memorystatus_remove(proc_t p, boolean_t locked)
2483 {
2484 int ret;
2485 memstat_bucket_t *bucket;
2486 boolean_t reschedule = FALSE;
2487
2488 MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", p->p_pid);
2489
2490 if (!locked) {
2491 proc_list_lock();
2492 }
2493
2494 assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
2495
2496 bucket = &memstat_bucket[p->p_memstat_effectivepriority];
2497
2498 if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
2499
2500 assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs);
2501 reschedule = TRUE;
2502
2503 } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
2504
2505 assert(bucket->count == memorystatus_scheduled_idle_demotions_apps);
2506 reschedule = TRUE;
2507 }
2508
2509 /*
2510 * Record idle delta
2511 */
2512
2513 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2514 uint64_t now = mach_absolute_time();
2515 if (now > p->p_memstat_idle_start) {
2516 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
2517 }
2518 }
2519
2520 TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
2521 bucket->count--;
2522
2523 memorystatus_list_count--;
2524
2525 /* If awaiting demotion to the idle band, clean up */
2526 if (reschedule) {
2527 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2528 memorystatus_reschedule_idle_demotion_locked();
2529 }
2530
2531 memorystatus_check_levels_locked();
2532
2533 #if CONFIG_FREEZE
2534 if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
2535 memorystatus_frozen_count--;
2536 }
2537
2538 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
2539 memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
2540 memorystatus_suspended_count--;
2541 }
2542 #endif
2543
2544 if (!locked) {
2545 proc_list_unlock();
2546 }
2547
2548 if (p) {
2549 ret = 0;
2550 } else {
2551 ret = ESRCH;
2552 }
2553
2554 return ret;
2555 }
2556
2557 /*
2558 * Validate dirty tracking flags with process state.
2559 *
2560 * Return:
2561 * 0 on success
2562 * non-0 on failure
2563 *
2564 * The proc_list_lock is held by the caller.
2565 */
2566
2567 static int
2568 memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) {
2569 /* See that the process isn't marked for termination */
2570 if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
2571 return EBUSY;
2572 }
2573
2574 /* Idle exit requires that process be tracked */
2575 if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
2576 !(pcontrol & PROC_DIRTY_TRACK)) {
2577 return EINVAL;
2578 }
2579
2580 /* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
2581 if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
2582 !(pcontrol & PROC_DIRTY_TRACK)) {
2583 return EINVAL;
2584 }
2585
2586 /* Deferral is only relevant if idle exit is specified */
2587 if ((pcontrol & PROC_DIRTY_DEFER) &&
2588 !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
2589 return EINVAL;
2590 }
2591
2592 return(0);
2593 }
2594
2595 static void
2596 memorystatus_update_idle_priority_locked(proc_t p) {
2597 int32_t priority;
2598
2599 MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty);
2600
2601 assert(isSysProc(p));
2602
2603 if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
2604
2605 priority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE;
2606 } else {
2607 priority = p->p_memstat_requestedpriority;
2608 }
2609
2610 if (priority != p->p_memstat_effectivepriority) {
2611
2612 if ((jetsam_aging_policy == kJetsamAgingPolicyLegacy) &&
2613 (priority == JETSAM_PRIORITY_IDLE)) {
2614
2615 /*
2616 * This process is on its way into the IDLE band. The system is
2617 * using 'legacy' jetsam aging policy. That means, this process
2618 * has already used up its idle-deferral aging time that is given
2619 * once per its lifetime. So we need to set the INACTIVE limits
2620 * explicitly because it won't be going through the demotion paths
2621 * that take care to apply the limits appropriately.
2622 */
2623
2624 if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
2625
2626 /*
2627 * This process has the 'elevated inactive jetsam band' attribute.
2628 * So, there will be no trip to IDLE after all.
2629 * Instead, we pin the process in the elevated band,
2630 * where its ACTIVE limits will apply.
2631 */
2632
2633 priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
2634 }
2635
2636 memorystatus_update_priority_locked(p, priority, false, true);
2637
2638 } else {
2639 memorystatus_update_priority_locked(p, priority, false, false);
2640 }
2641 }
2642 }
2643
2644 /*
2645 * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
2646 * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
2647 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
2648 * priority idle band when clean (and killed earlier, protecting higher priority processes).
2649 *
2650 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
2651 * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
2652 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
2653 * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
2654 * band. The deferral can be cleared early by clearing the appropriate flag.
2655 *
2656 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
2657 * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
2658 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
2659 */
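/*
 * Lifecycle sketch in terms of this file's entry points (illustrative only;
 * user space is assumed to reach these via private libproc wrappers such as
 * proc_track_dirty()/proc_set_dirty()):
 *
 *   memorystatus_dirty_track(p, PROC_DIRTY_TRACK |
 *                               PROC_DIRTY_ALLOW_IDLE_EXIT |
 *                               PROC_DIRTY_DEFER);   // opt in, with deferral
 *   memorystatus_dirty_set(p, TRUE, 1);              // nonzero pcontrol -> mark dirty (busy)
 *   ...                                              // do work
 *   memorystatus_dirty_set(p, TRUE, 0);              // zero pcontrol -> mark clean; may age/idle-exit
 */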
2660
2661 int
2662 memorystatus_dirty_track(proc_t p, uint32_t pcontrol) {
2663 unsigned int old_dirty;
2664 boolean_t reschedule = FALSE;
2665 boolean_t already_deferred = FALSE;
2666 boolean_t defer_now = FALSE;
2667 int ret = 0;
2668
2669 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK),
2670 p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0);
2671
2672 proc_list_lock();
2673
2674 if ((p->p_listflag & P_LIST_EXITED) != 0) {
2675 /*
2676 * Process is on its way out.
2677 */
2678 ret = EBUSY;
2679 goto exit;
2680 }
2681
2682 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
2683 ret = EPERM;
2684 goto exit;
2685 }
2686
2687 if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
2688 /* error */
2689 goto exit;
2690 }
2691
2692 old_dirty = p->p_memstat_dirty;
2693
2694 /* These bits are cumulative, as per <rdar://problem/11159924> */
2695 if (pcontrol & PROC_DIRTY_TRACK) {
2696 p->p_memstat_dirty |= P_DIRTY_TRACK;
2697 }
2698
2699 if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
2700 p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
2701 }
2702
2703 if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
2704 p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
2705 }
2706
2707 if (old_dirty & P_DIRTY_AGING_IN_PROGRESS) {
2708 already_deferred = TRUE;
2709 }
2710
2711
2712 /* This can be set and cleared exactly once. */
2713 if (pcontrol & PROC_DIRTY_DEFER) {
2714
2715 if ( !(old_dirty & P_DIRTY_DEFER)) {
2716 p->p_memstat_dirty |= P_DIRTY_DEFER;
2717 }
2718
2719 defer_now = TRUE;
2720 }
2721
2722 MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for pid %d\n",
2723 ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
2724 defer_now ? "Y" : "N",
2725 p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
2726 p->p_pid);
2727
2728 /* Kick off or invalidate the idle exit deferment if there's a state transition. */
2729 if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
2730 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
2731
2732 if (defer_now && !already_deferred) {
2733
2734 /*
2735 * Request to defer a clean process that's idle-exit enabled
2736 * and not already in the jetsam deferred band. Most likely a
2737 * new launch.
2738 */
2739 memorystatus_schedule_idle_demotion_locked(p, TRUE);
2740 reschedule = TRUE;
2741
2742 } else if (!defer_now) {
2743
2744 /*
2745 * The process isn't asking for the 'aging' facility.
2746 * Could be that it is:
2747 */
2748
2749 if (already_deferred) {
2750 /*
2751 * already in the aging bands. Traditionally,
2752 * some processes have tried to use this to
2753 * opt out of the 'aging' facility.
2754 */
2755
2756 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2757 } else {
2758 /*
2759 * agnostic to the 'aging' facility. In that case,
2760 * we'll go ahead and opt it in because this is likely
2761 * a new launch (clean process, dirty tracking enabled)
2762 */
2763
2764 memorystatus_schedule_idle_demotion_locked(p, TRUE);
2765 }
2766
2767 reschedule = TRUE;
2768 }
2769 }
2770 } else {
2771
2772 /*
2773 * We are trying to operate on a dirty process. Dirty processes have to
2774 * be removed from the deferred band. The question is do we reset the
2775 * deferred state or not?
2776 *
2777 * This could be a legal request like:
2778 * - this process had opted into the 'aging' band
2779 * - but it's now dirty and requests to opt out.
2780 * In this case, we remove the process from the band and reset its
2781 * state too. It'll opt back in properly when needed.
2782 *
2783 * OR, this request could be a user-space bug. E.g.:
2784 * - this process had opted into the 'aging' band when clean
2785 * - and then issues another request to put it into the band again, except
2786 * this time the process is dirty.
2787 * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of
2788 * the deferred band with its state intact. So our request below is a no-op.
2789 * But we do it here anyway for coverage.
2790 *
2791 * memorystatus_update_idle_priority_locked()
2792 * single-mindedly treats a dirty process as "cannot be in the aging band".
2793 */
2794
2795 if (!defer_now && already_deferred) {
2796 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2797 reschedule = TRUE;
2798 } else {
2799
2800 boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
2801
2802 memorystatus_invalidate_idle_demotion_locked(p, reset_state);
2803 reschedule = TRUE;
2804 }
2805 }
2806
2807 memorystatus_update_idle_priority_locked(p);
2808
2809 if (reschedule) {
2810 memorystatus_reschedule_idle_demotion_locked();
2811 }
2812
2813 ret = 0;
2814
2815 exit:
2816 proc_list_unlock();
2817
2818 return ret;
2819 }
2820
2821 int
2822 memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) {
2823 int ret;
2824 boolean_t kill = false;
2825 boolean_t reschedule = FALSE;
2826 boolean_t was_dirty = FALSE;
2827 boolean_t now_dirty = FALSE;
2828
2829 MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty);
2830 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0);
2831
2832 proc_list_lock();
2833
2834 if ((p->p_listflag & P_LIST_EXITED) != 0) {
2835 /*
2836 * Process is on its way out.
2837 */
2838 ret = EBUSY;
2839 goto exit;
2840 }
2841
2842 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
2843 ret = EPERM;
2844 goto exit;
2845 }
2846
2847 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
2848 was_dirty = TRUE;
2849
2850 if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
2851 /* Dirty tracking not enabled */
2852 ret = EINVAL;
2853 } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
2854 /*
2855 * Process is set to be terminated and we're attempting to mark it dirty.
2856 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
2857 */
2858 ret = EBUSY;
2859 } else {
2860 int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
2861 if (pcontrol && !(p->p_memstat_dirty & flag)) {
2862 /* Mark the process as having been dirtied at some point */
2863 p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
2864 memorystatus_dirty_count++;
2865 ret = 0;
2866 } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
2867 if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
2868 /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
2869 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
2870 kill = true;
2871 } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
2872 /* Kill previously terminated processes if set clean */
2873 kill = true;
2874 }
2875 p->p_memstat_dirty &= ~flag;
2876 memorystatus_dirty_count--;
2877 ret = 0;
2878 } else {
2879 /* Already set */
2880 ret = EALREADY;
2881 }
2882 }
2883
2884 if (ret != 0) {
2885 goto exit;
2886 }
2887
2888 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
2889 now_dirty = TRUE;
2890
2891 if ((was_dirty == TRUE && now_dirty == FALSE) ||
2892 (was_dirty == FALSE && now_dirty == TRUE)) {
2893
2894 /* Manage idle exit deferral, if applied */
2895 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
2896
2897 /*
2898 * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
2899 * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
2900 *
2901 * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
2902 * in that band on its way to IDLE.
2903 */
2904
2905 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
2906 /*
2907 * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE"
2908 *
2909 * The process will move from its aging band to its higher requested
2910 * jetsam band.
2911 */
2912 boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
2913
2914 memorystatus_invalidate_idle_demotion_locked(p, reset_state);
2915 reschedule = TRUE;
2916 } else {
2917
2918 /*
2919 * Process is back from "dirty" to "clean".
2920 */
2921
2922 if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) {
2923 if (mach_absolute_time() >= p->p_memstat_idledeadline) {
2924 /*
2925 * The process' deadline has expired. It currently
2926 * does not reside in any of the aging buckets.
2927 *
2928 * It's on its way to the JETSAM_PRIORITY_IDLE
2929 * bucket via memorystatus_update_idle_priority_locked()
2930 * below.
2931 *
2932 * So all we need to do is reset all the state on the
2933 * process that's related to the aging bucket i.e.
2934 * the AGING_IN_PROGRESS flag and the timer deadline.
2935 */
2936
2937 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2938 reschedule = TRUE;
2939 } else {
2940 /*
2941 * It still has some protection window left and so
2942 * we just re-arm the timer without modifying any
2943 * state on the process iff it still wants into that band.
2944 */
2945
2946 if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) {
2947 memorystatus_schedule_idle_demotion_locked(p, FALSE);
2948 reschedule = TRUE;
2949 }
2950 }
2951 } else {
2952
2953 memorystatus_schedule_idle_demotion_locked(p, TRUE);
2954 reschedule = TRUE;
2955 }
2956 }
2957 }
2958
2959 memorystatus_update_idle_priority_locked(p);
2960
2961 if (memorystatus_highwater_enabled) {
2962 boolean_t ledger_update_needed = TRUE;
2963 boolean_t use_active;
2964 boolean_t is_fatal;
2965 /*
2966 * We are in this path because this process transitioned between
2967 * dirty <--> clean state. Update the cached memory limits.
2968 */
2969
2970 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
2971 /*
2972 * process is pinned in elevated band
2973 * or
2974 * process is dirty
2975 */
2976 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
2977 use_active = TRUE;
2978 ledger_update_needed = TRUE;
2979 } else {
2980 /*
2981 * process is clean...but if it has opted into pressured-exit
2982 * we don't apply the INACTIVE limit till the process has aged
2983 * out and is entering the IDLE band.
2984 * See memorystatus_update_priority_locked() for that.
2985 */
2986
2987 if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
2988 ledger_update_needed = FALSE;
2989 } else {
2990 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
2991 use_active = FALSE;
2992 ledger_update_needed = TRUE;
2993 }
2994 }
2995
2996 /*
2997 * Enforce the new limits by writing to the ledger.
2998 *
2999 * This is a hot path and holding the proc_list_lock while writing to the ledgers,
3000 * (where the task lock is taken) is bad. So, we temporarily drop the proc_list_lock.
3001 * We aren't traversing the jetsam bucket list here, so we should be safe.
3002 * See rdar://21394491.
3003 */
3004
3005 if (ledger_update_needed && proc_ref_locked(p) == p) {
3006 int ledger_limit;
3007 if (p->p_memstat_memlimit > 0) {
3008 ledger_limit = p->p_memstat_memlimit;
3009 } else {
3010 ledger_limit = -1;
3011 }
3012 proc_list_unlock();
3013 task_set_phys_footprint_limit_internal(p->task, ledger_limit, NULL, use_active, is_fatal);
3014 proc_list_lock();
3015 proc_rele_locked(p);
3016
3017 MEMORYSTATUS_DEBUG(3, "memorystatus_dirty_set: new limit on pid %d (%dMB %s) priority(%d) dirty?=0x%x %s\n",
3018 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
3019 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
3020 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
3021 }
3022
3023 }
3024
3025 /* If the deferral state changed, reschedule the demotion timer */
3026 if (reschedule) {
3027 memorystatus_reschedule_idle_demotion_locked();
3028 }
3029 }
3030
3031 if (kill) {
3032 if (proc_ref_locked(p) == p) {
3033 proc_list_unlock();
3034 psignal(p, SIGKILL);
3035 proc_list_lock();
3036 proc_rele_locked(p);
3037 }
3038 }
3039
3040 exit:
3041 proc_list_unlock();
3042
3043 return ret;
3044 }
3045
3046 int
3047 memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) {
3048
3049 int ret = 0;
3050
3051 MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty);
3052
3053 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0);
3054
3055 proc_list_lock();
3056
3057 if ((p->p_listflag & P_LIST_EXITED) != 0) {
3058 /*
3059 * Process is on its way out.
3060 */
3061 ret = EBUSY;
3062 goto exit;
3063 }
3064
3065 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3066 ret = EPERM;
3067 goto exit;
3068 }
3069
3070 if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
3071 /* Dirty tracking not enabled */
3072 ret = EINVAL;
3073 goto exit;
3074 }
3075
3076 if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER)) == 0) {
3077 ret = EINVAL;
3078 goto exit;
3079 }
3080
3081 if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
3082 p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
3083 }
3084
3085 /* This can be set and cleared exactly once. */
3086 if (pcontrol & PROC_DIRTY_DEFER) {
3087
3088 if (p->p_memstat_dirty & P_DIRTY_DEFER) {
3089
3090 p->p_memstat_dirty &= ~P_DIRTY_DEFER;
3091
3092 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
3093 memorystatus_update_idle_priority_locked(p);
3094 memorystatus_reschedule_idle_demotion_locked();
3095 }
3096 }
3097
3098 ret = 0;
3099 exit:
3100 proc_list_unlock();
3101
3102 return ret;
3103 }
3104
3105 int
3106 memorystatus_dirty_get(proc_t p) {
3107 int ret = 0;
3108
3109 proc_list_lock();
3110
3111 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
3112 ret |= PROC_DIRTY_TRACKED;
3113 if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
3114 ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
3115 }
3116 if (p->p_memstat_dirty & P_DIRTY) {
3117 ret |= PROC_DIRTY_IS_DIRTY;
3118 }
3119 if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
3120 ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
3121 }
3122 }
3123
3124 proc_list_unlock();
3125
3126 return ret;
3127 }
3128
3129 int
3130 memorystatus_on_terminate(proc_t p) {
3131 int sig;
3132
3133 proc_list_lock();
3134
3135 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3136
3137 if ((p->p_memstat_dirty & (P_DIRTY_TRACK|P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) {
3138 /* Clean; mark as terminated and issue SIGKILL */
3139 sig = SIGKILL;
3140 } else {
3141 /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
3142 sig = SIGTERM;
3143 }
3144
3145 proc_list_unlock();
3146
3147 return sig;
3148 }
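/*
 * Summary of the decision above (descriptive, not new policy):
 *   tracked && clean  -> SIGKILL (safe to kill outright, nothing to flush)
 *   otherwise         -> SIGTERM (give the process a chance to clean up)
 * The caller is expected to deliver whichever signal is returned.
 */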
3149
3150 void
3151 memorystatus_on_suspend(proc_t p)
3152 {
3153 #if CONFIG_FREEZE
3154 uint32_t pages;
3155 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
3156 #endif
3157 proc_list_lock();
3158 #if CONFIG_FREEZE
3159 p->p_memstat_suspendedfootprint = pages;
3160 memorystatus_suspended_footprint_total += pages;
3161 memorystatus_suspended_count++;
3162 #endif
3163 p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
3164 proc_list_unlock();
3165 }
3166
3167 void
3168 memorystatus_on_resume(proc_t p)
3169 {
3170 #if CONFIG_FREEZE
3171 boolean_t frozen;
3172 pid_t pid;
3173 #endif
3174
3175 proc_list_lock();
3176
3177 #if CONFIG_FREEZE
3178 frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
3179 if (frozen) {
3180 memorystatus_frozen_count--;
3181 p->p_memstat_state |= P_MEMSTAT_PRIOR_THAW;
3182 }
3183
3184 memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
3185 memorystatus_suspended_count--;
3186
3187 pid = p->p_pid;
3188 #endif
3189
3190 p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
3191
3192 proc_list_unlock();
3193
3194 #if CONFIG_FREEZE
3195 if (frozen) {
3196 memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
3197 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
3198 }
3199 #endif
3200 }
3201
3202 void
3203 memorystatus_on_inactivity(proc_t p)
3204 {
3205 #pragma unused(p)
3206 #if CONFIG_FREEZE
3207 /* Wake the freeze thread */
3208 thread_wakeup((event_t)&memorystatus_freeze_wakeup);
3209 #endif
3210 }
3211
3212 /*
3213 * The proc_list_lock is held by the caller.
3214 */
3215 static uint32_t
3216 memorystatus_build_state(proc_t p) {
3217 uint32_t snapshot_state = 0;
3218
3219 /* General */
3220 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
3221 snapshot_state |= kMemorystatusSuspended;
3222 }
3223 if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
3224 snapshot_state |= kMemorystatusFrozen;
3225 }
3226 if (p->p_memstat_state & P_MEMSTAT_PRIOR_THAW) {
3227 snapshot_state |= kMemorystatusWasThawed;
3228 }
3229
3230 /* Tracking */
3231 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
3232 snapshot_state |= kMemorystatusTracked;
3233 }
3234 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
3235 snapshot_state |= kMemorystatusSupportsIdleExit;
3236 }
3237 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3238 snapshot_state |= kMemorystatusDirty;
3239 }
3240
3241 return snapshot_state;
3242 }
3243
3244 static boolean_t
3245 kill_idle_exit_proc(void)
3246 {
3247 proc_t p, victim_p = PROC_NULL;
3248 uint64_t current_time;
3249 boolean_t killed = FALSE;
3250 unsigned int i = 0;
3251 os_reason_t jetsam_reason = OS_REASON_NULL;
3252
3253 /* Pick next idle exit victim. */
3254 current_time = mach_absolute_time();
3255
3256 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT);
3257 if (jetsam_reason == OS_REASON_NULL) {
3258 printf("kill_idle_exit_proc: failed to allocate jetsam reason\n");
3259 }
3260
3261 proc_list_lock();
3262
3263 p = memorystatus_get_first_proc_locked(&i, FALSE);
3264 while (p) {
3265 /* No need to look beyond the idle band */
3266 if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
3267 break;
3268 }
3269
3270 if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT|P_DIRTY_IS_DIRTY|P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
3271 if (current_time >= p->p_memstat_idledeadline) {
3272 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3273 victim_p = proc_ref_locked(p);
3274 break;
3275 }
3276 }
3277
3278 p = memorystatus_get_next_proc_locked(&i, p, FALSE);
3279 }
3280
3281 proc_list_unlock();
3282
3283 if (victim_p) {
3284 printf("memorystatus: killing_idle_process pid %d [%s]\n", victim_p->p_pid, (*victim_p->p_name ? victim_p->p_name : "unknown"));
3285 killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason);
3286 proc_rele(victim_p);
3287 } else {
3288 os_reason_free(jetsam_reason);
3289 }
3290
3291 return killed;
3292 }
3293
3294 static void
3295 memorystatus_thread_wake(void) {
3296 thread_wakeup((event_t)&memorystatus_wakeup);
3297 }
3298
3299 extern void vm_pressure_response(void);
3300
3301 static int
3302 memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
3303 {
3304 if (interval_ms) {
3305 assert_wait_timeout(&memorystatus_wakeup, THREAD_UNINT, interval_ms, 1000 * NSEC_PER_USEC);
3306 } else {
3307 assert_wait(&memorystatus_wakeup, THREAD_UNINT);
3308 }
3309
3310 return thread_block(continuation);
3311 }
3312
3313 static boolean_t
3314 memorystatus_avail_pages_below_pressure(void)
3315 {
3316 #if CONFIG_EMBEDDED
3317 /*
3318 * Instead of CONFIG_EMBEDDED for these *avail_pages* routines, we should
3319 * key off of the system having dynamic swap support. With full swap support,
3320 * the system shouldn't really need to worry about various page thresholds.
3321 */
3322 return (memorystatus_available_pages <= memorystatus_available_pages_pressure);
3323 #else /* CONFIG_EMBEDDED */
3324 return FALSE;
3325 #endif /* CONFIG_EMBEDDED */
3326 }
3327
3328 static boolean_t
3329 memorystatus_avail_pages_below_critical(void)
3330 {
3331 #if CONFIG_EMBEDDED
3332 return (memorystatus_available_pages <= memorystatus_available_pages_critical);
3333 #else /* CONFIG_EMBEDDED */
3334 return FALSE;
3335 #endif /* CONFIG_EMBEDDED */
3336 }
3337
3338 static boolean_t
3339 memorystatus_post_snapshot(int32_t priority, uint32_t cause)
3340 {
3341 #if CONFIG_EMBEDDED
3342 #pragma unused(cause)
3343 /*
3344 * Don't generate logs for steady-state idle-exit kills,
3345 * unless it is overridden for debug or by the device
3346 * tree.
3347 */
3348
3349 return ((priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot);
3350
3351 #else /* CONFIG_EMBEDDED */
3352 /*
3353 * Don't generate logs for steady-state idle-exit kills,
3354 * unless:
3355 * - the behavior is overridden for debug or by the
3356 * device tree,
3357 * OR
3358 * - the kill cause is important, i.e. not kMemorystatusKilledIdleExit.
3359 */
3360
3361 boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause));
3362 return ((priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot || snapshot_eligible_kill_cause);
3363 #endif /* CONFIG_EMBEDDED */
3364 }
3365
3366 static boolean_t
3367 memorystatus_action_needed(void)
3368 {
3369 #if CONFIG_EMBEDDED
3370 return (is_reason_thrashing(kill_under_pressure_cause) ||
3371 is_reason_zone_map_exhaustion(kill_under_pressure_cause) ||
3372 memorystatus_available_pages <= memorystatus_available_pages_pressure);
3373 #else /* CONFIG_EMBEDDED */
3374 return (is_reason_thrashing(kill_under_pressure_cause) ||
3375 is_reason_zone_map_exhaustion(kill_under_pressure_cause));
3376 #endif /* CONFIG_EMBEDDED */
3377 }
3378
3379 static boolean_t
3380 memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, boolean_t *post_snapshot, __unused boolean_t *is_critical)
3381 {
3382 boolean_t killed = memorystatus_kill_hiwat_proc(errors);
3383
3384 if (killed) {
3385 *hwm_kill = *hwm_kill + 1;
3386 *post_snapshot = TRUE;
3387 return TRUE;
3388 } else {
3389 memorystatus_hwm_candidates = FALSE;
3390 }
3391
3392 #if CONFIG_JETSAM
3393 /* No highwater processes to kill. Continue or stop for now? */
3394 if (!is_reason_thrashing(kill_under_pressure_cause) &&
3395 !is_reason_zone_map_exhaustion(kill_under_pressure_cause) &&
3396 (memorystatus_available_pages > memorystatus_available_pages_critical)) {
3397 /*
3398 * We are _not_ out of pressure but we are above the critical threshold and there's:
3399 * - no compressor thrashing
3400 * - enough zone memory
3401 * - no more HWM processes left.
3402 * For now, don't kill any other processes.
3403 */
3404
3405 if (*hwm_kill == 0) {
3406 memorystatus_thread_wasted_wakeup++;
3407 }
3408
3409 *is_critical = FALSE;
3410
3411 return TRUE;
3412 }
3413 #endif /* CONFIG_JETSAM */
3414
3415 return FALSE;
3416 }
3417
3418 static boolean_t
3419 memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_idle_kills, boolean_t *corpse_list_purged, boolean_t *post_snapshot)
3420 {
3421 if (memorystatus_jld_enabled == TRUE) {
3422
3423 boolean_t killed;
3424 uint32_t errors = 0;
3425
3426 /* Jetsam Loop Detection - locals */
3427 memstat_bucket_t *bucket;
3428 int jld_bucket_count = 0;
3429 struct timeval jld_now_tstamp = {0,0};
3430 uint64_t jld_now_msecs = 0;
3431 int elevated_bucket_count = 0;
3432
3433 /* Jetsam Loop Detection - statics */
3434 static uint64_t jld_timestamp_msecs = 0;
3435 static int jld_idle_kill_candidates = 0; /* Number of available processes in band 0,1 at start */
3436 static int jld_eval_aggressive_count = 0; /* Bumps the max priority in aggressive loop */
3437 static int32_t jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
3438 /*
3439 * Jetsam Loop Detection: attempt to detect
3440 * rapid daemon relaunches in the lower bands.
3441 */
3442
3443 microuptime(&jld_now_tstamp);
3444
3445 /*
3446 * Ignore usecs in this calculation.
3447 * msecs granularity is close enough.
3448 */
3449 jld_now_msecs = (jld_now_tstamp.tv_sec * 1000);
3450
3451 proc_list_lock();
3452 switch (jetsam_aging_policy) {
3453 case kJetsamAgingPolicyLegacy:
3454 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3455 jld_bucket_count = bucket->count;
3456 bucket = &memstat_bucket[JETSAM_PRIORITY_AGING_BAND1];
3457 jld_bucket_count += bucket->count;
3458 break;
3459 case kJetsamAgingPolicySysProcsReclaimedFirst:
3460 case kJetsamAgingPolicyAppsReclaimedFirst:
3461 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3462 jld_bucket_count = bucket->count;
3463 bucket = &memstat_bucket[system_procs_aging_band];
3464 jld_bucket_count += bucket->count;
3465 bucket = &memstat_bucket[applications_aging_band];
3466 jld_bucket_count += bucket->count;
3467 break;
3468 case kJetsamAgingPolicyNone:
3469 default:
3470 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3471 jld_bucket_count = bucket->count;
3472 break;
3473 }
3474
3475 bucket = &memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE];
3476 elevated_bucket_count = bucket->count;
3477
3478 proc_list_unlock();
3479
3480 /*
3481 * memorystatus_jld_eval_period_msecs is a tunable
3482 * memorystatus_jld_eval_aggressive_count is a tunable
3483 * memorystatus_jld_eval_aggressive_priority_band_max is a tunable
3484 */
3485 if ( (jld_bucket_count == 0) ||
3486 (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) {
3487
3488 /*
3489 * Refresh evaluation parameters
3490 */
3491 jld_timestamp_msecs = jld_now_msecs;
3492 jld_idle_kill_candidates = jld_bucket_count;
3493 *jld_idle_kills = 0;
3494 jld_eval_aggressive_count = 0;
3495 jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
3496 }
3497
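/*
 * If we have killed more processes out of the monitored idle bands than
 * were present when the evaluation window began, daemons are relaunching
 * faster than we can reclaim them: escalate to an aggressive pass.
 */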
3498 if (*jld_idle_kills > jld_idle_kill_candidates) {
3499 jld_eval_aggressive_count++;
3500
3501 #if DEVELOPMENT || DEBUG
3502 printf("memorystatus: aggressive%d: beginning of window: %lld ms, : timestamp now: %lld ms\n",
3503 jld_eval_aggressive_count,
3504 jld_timestamp_msecs,
3505 jld_now_msecs);
3506 printf("memorystatus: aggressive%d: idle candidates: %d, idle kills: %d\n",
3507 jld_eval_aggressive_count,
3508 jld_idle_kill_candidates,
3509 *jld_idle_kills);
3510 #endif /* DEVELOPMENT || DEBUG */
3511
3512 if ((jld_eval_aggressive_count == memorystatus_jld_eval_aggressive_count) &&
3513 (total_corpses_count() > 0) && (*corpse_list_purged == FALSE)) {
3514 /*
3515 * If we reach this aggressive cycle, corpses might be causing memory pressure.
3516 * So, in an effort to avoid jetsams in the FG band, we will attempt to purge
3517 * corpse memory prior to this final march through JETSAM_PRIORITY_UI_SUPPORT.
3518 */
3519 task_purge_all_corpses();
3520 *corpse_list_purged = TRUE;
3521 }
3522 else if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) {
3523 /*
3524 * Bump up the jetsam priority limit (e.g. the bucket index).
3525 * Enforce bucket index sanity.
3526 */
3527 if ((memorystatus_jld_eval_aggressive_priority_band_max < 0) ||
3528 (memorystatus_jld_eval_aggressive_priority_band_max >= MEMSTAT_BUCKET_COUNT)) {
3529 /*
3530 * Do nothing. Stick with the default level.
3531 */
3532 } else {
3533 jld_priority_band_max = memorystatus_jld_eval_aggressive_priority_band_max;
3534 }
3535 }
3536
3537 /* Visit elevated processes first */
3538 while (elevated_bucket_count) {
3539
3540 elevated_bucket_count--;
3541
3542 /*
3543 * memorystatus_kill_elevated_process() drops a reference,
3544 * so take another one so we can continue to use this exit reason
3545 * even after it returns.
3546 */
3547
3548 os_reason_ref(jetsam_reason);
3549 killed = memorystatus_kill_elevated_process(
3550 cause,
3551 jetsam_reason,
3552 jld_eval_aggressive_count,
3553 &errors);
3554
3555 if (killed) {
3556 *post_snapshot = TRUE;
3557 if (memorystatus_avail_pages_below_pressure()) {
3558 /*
3559 * Still under pressure.
3560 * Find another pinned process.
3561 */
3562 continue;
3563 } else {
3564 return TRUE;
3565 }
3566 } else {
3567 /*
3568 * No pinned processes left to kill.
3569 * Abandon elevated band.
3570 */
3571 break;
3572 }
3573 }
3574
3575 /*
3576 * memorystatus_kill_top_process_aggressive() allocates its own
3577 * jetsam_reason so the kMemorystatusKilledVMThrashing cause
3578 * is consistent throughout the aggressive march.
3579 */
3580 killed = memorystatus_kill_top_process_aggressive(
3581 kMemorystatusKilledVMThrashing,
3582 jld_eval_aggressive_count,
3583 jld_priority_band_max,
3584 &errors);
3585
3586 if (killed) {
3587 /* Always generate logs after aggressive kill */
3588 *post_snapshot = TRUE;
3589 *jld_idle_kills = 0;
3590 return TRUE;
3591 }
3592 }
3593
3594 return FALSE;
3595 }
3596
3597 return FALSE;
3598 }
3599
3600
3601 static void
3602 memorystatus_thread(void *param __unused, wait_result_t wr __unused)
3603 {
3604 static boolean_t is_vm_privileged = FALSE;
3605
3606 boolean_t post_snapshot = FALSE;
3607 uint32_t errors = 0;
3608 uint32_t hwm_kill = 0;
3609 boolean_t sort_flag = TRUE;
3610 boolean_t corpse_list_purged = FALSE;
3611 int jld_idle_kills = 0;
3612
3613 if (is_vm_privileged == FALSE) {
3614 /*
3615 * It's the first time the thread has run, so just mark the thread as privileged and block.
3616 * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
3617 */
3618 thread_wire(host_priv_self(), current_thread(), TRUE);
3619 is_vm_privileged = TRUE;
3620
3621 if (vm_restricted_to_single_processor == TRUE)
3622 thread_vm_bind_group_add();
3623 thread_set_thread_name(current_thread(), "VM_memorystatus");
3624 memorystatus_thread_block(0, memorystatus_thread);
3625 }
3626
3627 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
3628 memorystatus_available_pages, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, memorystatus_jld_eval_aggressive_count,0);
3629
3630 /*
3631 * Jetsam aware version.
3632 *
3633 * The VM pressure notification thread is working its way through clients in parallel.
3634 *
3635 * So, while the pressure notification thread is targeting processes in order of
3636 * increasing jetsam priority, we can hopefully reduce / stop its work by killing
3637 * any processes that have exceeded their highwater mark.
3638 *
3639 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
3640 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
3641 */
3642 while (memorystatus_action_needed()) {
3643 boolean_t killed;
3644 int32_t priority;
3645 uint32_t cause;
3646 uint64_t jetsam_reason_code = JETSAM_REASON_INVALID;
3647 os_reason_t jetsam_reason = OS_REASON_NULL;
3648
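/*
 * Map the pending kill cause to a jetsam reason code; anything
 * unrecognized is treated as a VM page shortage.
 */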
3649 cause = kill_under_pressure_cause;
3650 switch (cause) {
3651 case kMemorystatusKilledFCThrashing:
3652 jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING;
3653 break;
3654 case kMemorystatusKilledVMThrashing:
3655 jetsam_reason_code = JETSAM_REASON_MEMORY_VMTHRASHING;
3656 break;
3657 case kMemorystatusKilledZoneMapExhaustion:
3658 jetsam_reason_code = JETSAM_REASON_ZONE_MAP_EXHAUSTION;
3659 break;
3660 case kMemorystatusKilledVMPageShortage:
3661 /* falls through */
3662 default:
3663 jetsam_reason_code = JETSAM_REASON_MEMORY_VMPAGESHORTAGE;
3664 cause = kMemorystatusKilledVMPageShortage;
3665 break;
3666 }
3667
3668 /* Highwater */
3669 boolean_t is_critical = TRUE;
3670 if (memorystatus_act_on_hiwat_processes(&errors, &hwm_kill, &post_snapshot, &is_critical)) {
3671 if (is_critical == FALSE) {
3672 /*
3673 * For now, don't kill any other processes.
3674 */
3675 break;
3676 } else {
3677 goto done;
3678 }
3679 }
3680
3681 jetsam_reason = os_reason_create(OS_REASON_JETSAM, jetsam_reason_code);
3682 if (jetsam_reason == OS_REASON_NULL) {
3683 printf("memorystatus_thread: failed to allocate jetsam reason\n");
3684 }
3685
3686 if (memorystatus_act_aggressive(cause, jetsam_reason, &jld_idle_kills, &corpse_list_purged, &post_snapshot)) {
3687 goto done;
3688 }
3689
3690 /*
3691 * memorystatus_kill_top_process() drops a reference,
3692 * so take another one so we can continue to use this exit reason
3693 * even after it returns
3694 */
3695 os_reason_ref(jetsam_reason);
3696
3697 /* LRU */
3698 killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, jetsam_reason, &priority, &errors);
3699 sort_flag = FALSE;
3700
3701 if (killed) {
3702 if (memorystatus_post_snapshot(priority, cause) == TRUE) {
3703
3704 post_snapshot = TRUE;
3705 }
3706
3707 /* Jetsam Loop Detection */
3708 if (memorystatus_jld_enabled == TRUE) {
3709 if ((priority == JETSAM_PRIORITY_IDLE) || (priority == system_procs_aging_band) || (priority == applications_aging_band)) {
3710 jld_idle_kills++;
3711 } else {
3712 /*
3713 * We've reached into bands beyond idle deferred.
3714 * We make no attempt to monitor them.
3715 */
3716 }
3717 }
3718
3719 if ((priority >= JETSAM_PRIORITY_UI_SUPPORT) && (total_corpses_count() > 0) && (corpse_list_purged == FALSE)) {
3720 /*
3721 * If we have jetsammed a process in or above JETSAM_PRIORITY_UI_SUPPORT
3722 * then we attempt to relieve pressure by purging corpse memory.
3723 */
3724 task_purge_all_corpses();
3725 corpse_list_purged = TRUE;
3726 }
3727 goto done;
3728 }
3729
3730 if (memorystatus_avail_pages_below_critical()) {
3731 /*
3732 * Still under pressure and unable to kill a process - purge corpse memory
3733 */
3734 if (total_corpses_count() > 0) {
3735 task_purge_all_corpses();
3736 corpse_list_purged = TRUE;
3737 }
3738
3739 if (memorystatus_avail_pages_below_critical()) {
3740 /*
3741 * Still under pressure and unable to kill a process - panic
3742 */
3743 panic("memorystatus_jetsam_thread: no victim! available pages:%llu\n", (uint64_t)memorystatus_available_pages);
3744 }
3745 }
3746
3747 done:
3748
3749 /*
3750 * We do not want to over-kill when thrashing has been detected.
3751 * To avoid that, we reset the flag here and notify the
3752 * compressor.
3753 */
3754 if (is_reason_thrashing(kill_under_pressure_cause)) {
3755 kill_under_pressure_cause = 0;
3756 #if CONFIG_JETSAM
3757 vm_thrashing_jetsam_done();
3758 #endif /* CONFIG_JETSAM */
3759 } else if (is_reason_zone_map_exhaustion(kill_under_pressure_cause)) {
3760 kill_under_pressure_cause = 0;
3761 }
3762
3763 os_reason_free(jetsam_reason);
3764 }
3765
3766 kill_under_pressure_cause = 0;
3767
3768 if (errors) {
3769 memorystatus_clear_errors();
3770 }
3771
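/*
 * Post a snapshot-available notification, rate-limited: only notify if
 * no notification is outstanding (last timestamp is 0) or the snapshot
 * timeout has elapsed since the previous one.
 */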
3772 if (post_snapshot) {
3773 proc_list_lock();
3774 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
3775 sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
3776 uint64_t timestamp_now = mach_absolute_time();
3777 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
3778 memorystatus_jetsam_snapshot->js_gencount++;
3779 if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
3780 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
3781 proc_list_unlock();
3782 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
3783 if (!ret) {
3784 proc_list_lock();
3785 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
3786 proc_list_unlock();
3787 }
3788 } else {
3789 proc_list_unlock();
3790 }
3791 }
3792
3793 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
3794 memorystatus_available_pages, 0, 0, 0, 0);
3795
3796 memorystatus_thread_block(0, memorystatus_thread);
3797 }
3798
3799 /*
3800 * Returns TRUE:
3801 * when an idle-exitable proc was killed
3802 * Returns FALSE:
3803 * when there are no more idle-exitable procs found
3804 * when the attempt to kill an idle-exitable proc failed
3805 */
3806 boolean_t memorystatus_idle_exit_from_VM(void) {
3807
3808 /*
3809 * This routine should no longer be needed since we are
3810 * now using jetsam bands on all platforms and so will deal
3811 * with IDLE processes within the memorystatus thread itself.
3812 *
3813 * But we still use it because we observed that macOS systems
3814 * started heavy compression/swapping with a bunch of
3815 * idle-exitable processes alive and doing nothing. We decided
3816 * we would rather kill those processes than start swapping earlier.
3817 */
3818
3819 return(kill_idle_exit_proc());
3820 }
3821
3822 /*
3823 * Callback invoked when allowable physical memory footprint exceeded
3824 * (dirty pages + IOKit mappings)
3825 *
3826 * This is invoked for both advisory, non-fatal per-task high watermarks,
3827 * as well as the fatal task memory limits.
3828 */
3829 void
3830 memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
3831 {
3832 os_reason_t jetsam_reason = OS_REASON_NULL;
3833
3834 proc_t p = current_proc();
3835
3836 #if VM_PRESSURE_EVENTS
3837 if (warning == TRUE) {
3838 /*
3839 * This is a warning path which implies that the current process is close, but has
3840 * not yet exceeded its per-process memory limit.
3841 */
3842 if (memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) {
3843 /* Print warning, since it's possible that the task has not registered for pressure notifications */
3844 os_log(OS_LOG_DEFAULT, "memorystatus_on_ledger_footprint_exceeded: failed to warn the current task (%d exiting, or no handler registered?).\n", p->p_pid);
3845 }
3846 return;
3847 }
3848 #endif /* VM_PRESSURE_EVENTS */
3849
3850 if (memlimit_is_fatal) {
3851 /*
3852 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
3853 * has violated either the system-wide per-task memory limit OR its own task limit.
3854 */
3855 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
3856 if (jetsam_reason == OS_REASON_NULL) {
3857 printf("task_exceeded footprint: failed to allocate jetsam reason\n");
3858 } else if (corpse_for_fatal_memkill != 0) {
3859 /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
3860 jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
3861 }
3862
3863 if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) {
3864 printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
3865 }
3866 } else {
3867 /*
3868 * HWM offender exists. Done without locks or synchronization.
3869 * See comment near its declaration for more details.
3870 */
3871 memorystatus_hwm_candidates = TRUE;
3872
3873 #if VM_PRESSURE_EVENTS
3874 /*
3875 * The current process is not in the warning path.
3876 * This path implies the current process has exceeded a non-fatal (soft) memory limit.
3877 * Failure to send note is ignored here.
3878 */
3879 (void)memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */);
3880
3881 #endif /* VM_PRESSURE_EVENTS */
3882 }
3883 }
3884
3885 void
3886 memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
3887 {
3888 proc_t p = current_proc();
3889
3890 /*
3891 * The limit violation is logged here, but only once per process per limit.
3892 * Soft memory limit is a non-fatal high-water-mark.
3893 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
3894 */
3895
3896 os_log_with_startup_serial(OS_LOG_DEFAULT, "EXC_RESOURCE -> %s[%d] exceeded mem limit: %s%s %d MB (%s)\n",
3897 (*p->p_name ? p->p_name : "unknown"), p->p_pid, (memlimit_is_active ? "Active" : "Inactive"),
3898 (memlimit_is_fatal ? "Hard" : "Soft"), max_footprint_mb,
3899 (memlimit_is_fatal ? "fatal" : "non-fatal"));
3900
3901 return;
3902 }
3903
3904
3905 /*
3906 * Description:
3907 * Evaluates process state to determine which limit
3908 * should be applied (active vs. inactive limit).
3909 *
3910 * Processes that have the 'elevated inactive jetsam band' attribute
3911 * are first evaluated based on their current priority band.
3912 * presently elevated ==> active
3913 *
3914 * Processes that opt into dirty tracking are evaluated
3915 * based on clean vs dirty state.
3916 * dirty ==> active
3917 * clean ==> inactive
3918 *
3919 * Processes that do not opt into dirty tracking are
3920 * evaluated based on priority level.
3921 * Foreground or above ==> active
3922 * Below Foreground ==> inactive
3923 *
3924 * Return: TRUE if active
3925 * FALSE if inactive
3926 */
3927
3928 static boolean_t
3929 proc_jetsam_state_is_active_locked(proc_t p) {
3930
3931 if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) &&
3932 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE)) {
3933 /*
3934 * process has the 'elevated inactive jetsam band' attribute
3935 * and process is present in the elevated band
3936 * implies active state
3937 */
3938 return TRUE;
3939 } else if (p->p_memstat_dirty & P_DIRTY_TRACK) {
3940 /*
3941 * process has opted into dirty tracking
3942 * active state is based on dirty vs. clean
3943 */
3944 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3945 /*
3946 * process is dirty
3947 * implies active state
3948 */
3949 return TRUE;
3950 } else {
3951 /*
3952 * process is clean
3953 * implies inactive state
3954 */
3955 return FALSE;
3956 }
3957 } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
3958 /*
3959 * process is Foreground or higher
3960 * implies active state
3961 */
3962 return TRUE;
3963 } else {
3964 /*
3965 * process found below Foreground
3966 * implies inactive state
3967 */
3968 return FALSE;
3969 }
3970 }
3971
3972 static boolean_t
3973 memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) {
3974 boolean_t res;
3975
3976 uint32_t errors = 0;
3977
3978 if (victim_pid == -1) {
3979 /* No pid, so kill first process */
3980 res = memorystatus_kill_top_process(TRUE, TRUE, cause, jetsam_reason, NULL, &errors);
3981 } else {
3982 res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason);
3983 }
3984
3985 if (errors) {
3986 memorystatus_clear_errors();
3987 }
3988
3989 if (res == TRUE) {
3990 /* Fire off snapshot notification */
3991 proc_list_lock();
3992 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
3993 sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
3994 uint64_t timestamp_now = mach_absolute_time();
3995 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
3996 if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
3997 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
3998 proc_list_unlock();
3999 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
4000 if (!ret) {
4001 proc_list_lock();
4002 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
4003 proc_list_unlock();
4004 }
4005 } else {
4006 proc_list_unlock();
4007 }
4008 }
4009
4010 return res;
4011 }
4012
4013 /*
4014 * Jetsam a specific process.
4015 */
4016 static boolean_t
4017 memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) {
4018 boolean_t killed;
4019 proc_t p;
4020 uint64_t killtime = 0;
4021 clock_sec_t tv_sec;
4022 clock_usec_t tv_usec;
4023 uint32_t tv_msec;
4024
4025 /* TODO - add a victim queue and push this into the main jetsam thread */
4026
4027 p = proc_find(victim_pid);
4028 if (!p) {
4029 os_reason_free(jetsam_reason);
4030 return FALSE;
4031 }
4032
4033 proc_list_lock();
4034
4035 if (memorystatus_jetsam_snapshot_count == 0) {
4036 memorystatus_init_jetsam_snapshot_locked(NULL,0);
4037 }
4038
4039 killtime = mach_absolute_time();
4040 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
4041 tv_msec = tv_usec / 1000;
4042
4043 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
4044
4045 proc_list_unlock();
4046
4047 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
4048 (unsigned long)tv_sec, tv_msec, victim_pid, (*p->p_name ? p->p_name : "unknown"),
4049 memorystatus_kill_cause_name[cause], p->p_memstat_effectivepriority, (uint64_t)memorystatus_available_pages);
4050
4051 killed = memorystatus_do_kill(p, cause, jetsam_reason);
4052 proc_rele(p);
4053
4054 return killed;
4055 }
4056
4057
4058 /*
4059 * Toggle the P_MEMSTAT_TERMINATED state.
4060 * Takes the proc_list_lock.
4061 */
4062 void
4063 proc_memstat_terminated(proc_t p, boolean_t set)
4064 {
4065 #if DEVELOPMENT || DEBUG
4066 if (p) {
4067 proc_list_lock();
4068 if (set == TRUE) {
4069 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
4070 } else {
4071 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
4072 }
4073 proc_list_unlock();
4074 }
4075 #else
4076 #pragma unused(p, set)
4077 /*
4078 * do nothing
4079 */
4080 #endif /* DEVELOPMENT || DEBUG */
4081 return;
4082 }
4083
4084
4085 #if CONFIG_JETSAM
4086 /*
4087 * This is invoked when cpulimits have been exceeded while in fatal mode.
4088 * The jetsam_flags do not apply as those are for memory related kills.
4089 * We call this routine so that the offending process is killed with
4090 * a non-zero exit status.
4091 */
4092 void
4093 jetsam_on_ledger_cpulimit_exceeded(void)
4094 {
4095 int retval = 0;
4096 int jetsam_flags = 0; /* make it obvious */
4097 proc_t p = current_proc();
4098 os_reason_t jetsam_reason = OS_REASON_NULL;
4099
4100 printf("task_exceeded_cpulimit: killing pid %d [%s]\n",
4101 p->p_pid, (*p->p_name ? p->p_name : "(unknown)"));
4102
4103 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT);
4104 if (jetsam_reason == OS_REASON_NULL) {
4105 printf("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n");
4106 }
4107
4108 retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
4109
4110 if (retval) {
4111 printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
4112 }
4113 }
4114
4115 #endif /* CONFIG_JETSAM */
4116
4117 static void
4118 memorystatus_get_task_memory_region_count(task_t task, uint64_t *count)
4119 {
4120 assert(task);
4121 assert(count);
4122
4123 *count = get_task_memory_region_count(task);
4124 }
4125
4126 #if DEVELOPMENT || DEBUG
4127
4128 /*
4129 * Sysctl only used to test the memorystatus_allowed_vm_map_fork() path:
4130 * set a new pidwatch value
4131 * or
4132 * get the current pidwatch value.
4133 */
4134
4135 uint64_t memorystatus_vm_map_fork_pidwatch_val = 0;
4136 #define MEMORYSTATUS_VM_MAP_FORK_ALLOWED 0x100000000
4137 #define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000
4138
4139 static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS {
4140 #pragma unused(oidp, arg1, arg2)
4141
4142 uint64_t new_value = 0;
4143 uint64_t old_value = 0;
4144 int error = 0;
4145
4146 /*
4147 * The pid is held in the low 32 bits.
4148 * The 'allowed' flags are in the upper 32 bits.
4149 */
4150 old_value = memorystatus_vm_map_fork_pidwatch_val;
4151
4152 error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL);
4153
4154 if (error || !req->newptr) {
4155 /*
4156 * No new value passed in.
4157 */
4158 return(error);
4159 }
4160
4161 /*
4162 * A new pid was passed in via req->newptr.
4163 * Ignore any attempt to set the higher order bits.
4164 */
4165 memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF;
4166 printf("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx \n", old_value, new_value);
4167
4168 return(error);
4169 }
4170
4171 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED| CTLFLAG_MASKED,
4172 0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork");
4173
4174
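/*
 * These macros record, in the upper 32 bits of the pidwatch value,
 * whether vm_map_fork was allowed or denied for the watched pid.
 */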
4175 #define SET_VM_MAP_FORK_PIDWATCH_ALLOWED(task) \
4176 MACRO_BEGIN \
4177 if (memorystatus_vm_map_fork_pidwatch_val != 0) { \
4178 proc_t p = get_bsdtask_info(task); \
4179 if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid)) { \
4180 memorystatus_vm_map_fork_pidwatch_val |= MEMORYSTATUS_VM_MAP_FORK_ALLOWED; \
4181 } \
4182 } \
4183 MACRO_END
4184
4185 #define SET_VM_MAP_FORK_PIDWATCH_NOT_ALLOWED(task) \
4186 MACRO_BEGIN \
4187 if (memorystatus_vm_map_fork_pidwatch_val != 0) { \
4188 proc_t p = get_bsdtask_info(task); \
4189 if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid)) { \
4190 memorystatus_vm_map_fork_pidwatch_val |= MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED; \
4191 } \
4192 } \
4193 MACRO_END
4194
4195 #else /* DEVELOPMENT || DEBUG */
4196
4197 #define SET_VM_MAP_FORK_PIDWATCH_ALLOWED(task)
4198 #define SET_VM_MAP_FORK_PIDWATCH_NOT_ALLOWED(task)
4199
4200 #endif /* DEVELOPMENT || DEBUG */
4201
4202 /*
4203 * Called during EXC_RESOURCE handling when a process exceeds a soft
4204 * memory limit. This is the corpse fork path and here we decide if
4205 * vm_map_fork will be allowed when creating the corpse.
4206 * The task being considered is suspended.
4207 *
4208 * By default, a vm_map_fork is allowed to proceed.
4209 *
4210 * A few simple policy assumptions:
4211 * The desktop platform is not considered in this path;
4212 * there, the vm_map_fork is always allowed.
4213 *
4214 * If the device has a zero system-wide task limit,
4215 * then the vm_map_fork is allowed.
4216 *
4217 * And if a process's memory footprint calculates less
4218 * than or equal to half of the system-wide task limit,
4219 * then the vm_map_fork is allowed. This calculation
4220 * is based on the assumption that a process can
4221 * munch memory up to the system-wide task limit.
4222 */
4223 boolean_t
4224 memorystatus_allowed_vm_map_fork(__unused task_t task)
4225 {
4226 boolean_t is_allowed = TRUE; /* default */
4227
4228 #if CONFIG_EMBEDDED
4229
4230 uint64_t footprint_in_bytes = 0;
4231 uint64_t purgeable_in_bytes = 0;
4232 uint64_t max_allowed_bytes = 0;
4233
4234 if (max_task_footprint_mb == 0) {
4235 SET_VM_MAP_FORK_PIDWATCH_ALLOWED(task);
4236 return (is_allowed);
4237 }
4238
4239 purgeable_in_bytes = get_task_purgeable_size(task);
4240 footprint_in_bytes = get_task_phys_footprint(task);
4241
4242 /*
4243 * Maximum is half the system-wide task limit.
4244 */
4245 max_allowed_bytes = ((((uint64_t)max_task_footprint_mb) * 1024ULL * 1024ULL) >> 1);
4246
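/*
 * Purgeable memory can be reclaimed by the system, so subtract it
 * from the footprint before comparing against the limit.
 */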
4247 if (footprint_in_bytes > purgeable_in_bytes) {
4248 footprint_in_bytes -= purgeable_in_bytes;
4249 }
4250
4251 if (footprint_in_bytes <= max_allowed_bytes) {
4252 SET_VM_MAP_FORK_PIDWATCH_ALLOWED(task);
4253 return (is_allowed);
4254 } else {
4255 printf("memorystatus disallowed vm_map_fork %lld %lld\n", footprint_in_bytes, max_allowed_bytes);
4256 SET_VM_MAP_FORK_PIDWATCH_NOT_ALLOWED(task);
4257 return (!is_allowed);
4258 }
4259
4260 #else /* CONFIG_EMBEDDED */
4261
4262 SET_VM_MAP_FORK_PIDWATCH_ALLOWED(task);
4263 return (is_allowed);
4264
4265 #endif /* CONFIG_EMBEDDED */
4266
4267 }
4268
4269 static void
4270 memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
4271 {
4272 assert(task);
4273 assert(footprint);
4274
4275 uint64_t pages;
4276
4277 pages = (get_task_phys_footprint(task) / PAGE_SIZE_64);
4278 assert(((uint32_t)pages) == pages);
4279 *footprint = (uint32_t)pages;
4280
4281 if (max_footprint) {
4282 pages = (get_task_phys_footprint_recent_max(task) / PAGE_SIZE_64);
4283 assert(((uint32_t)pages) == pages);
4284 *max_footprint = (uint32_t)pages;
4285 }
4286 if (max_footprint_lifetime) {
4287 pages = (get_task_resident_max(task) / PAGE_SIZE_64);
4288 assert(((uint32_t)pages) == pages);
4289 *max_footprint_lifetime = (uint32_t)pages;
4290 }
4291 if (purgeable_pages) {
4292 pages = (get_task_purgeable_size(task) / PAGE_SIZE_64);
4293 assert(((uint32_t)pages) == pages);
4294 *purgeable_pages = (uint32_t)pages;
4295 }
4296 }
4297
4298 static void
4299 memorystatus_get_task_phys_footprint_page_counts(task_t task,
4300 uint64_t *internal_pages, uint64_t *internal_compressed_pages,
4301 uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
4302 uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
4303 uint64_t *iokit_mapped_pages, uint64_t *page_table_pages)
4304 {
4305 assert(task);
4306
4307 if (internal_pages) {
4308 *internal_pages = (get_task_internal(task) / PAGE_SIZE_64);
4309 }
4310
4311 if (internal_compressed_pages) {
4312 *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64);
4313 }
4314
4315 if (purgeable_nonvolatile_pages) {
4316 *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64);
4317 }
4318
4319 if (purgeable_nonvolatile_compressed_pages) {
4320 *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64);
4321 }
4322
4323 if (alternate_accounting_pages) {
4324 *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64);
4325 }
4326
4327 if (alternate_accounting_compressed_pages) {
4328 *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64);
4329 }
4330
4331 if (iokit_mapped_pages) {
4332 *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64);
4333 }
4334
4335 if (page_table_pages) {
4336 *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64);
4337 }
4338 }
4339
4340 /*
4341 * This routine only acts on the global jetsam event snapshot.
4342 * Updating the process's entry can race when the memorystatus_thread
4343 * has chosen to kill a process that is racing to exit on another core.
4344 */
4345 static void
4346 memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime)
4347 {
4348 memorystatus_jetsam_snapshot_entry_t *entry = NULL;
4349 memorystatus_jetsam_snapshot_t *snapshot = NULL;
4350 memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
4351
4352 unsigned int i;
4353
4354 if (memorystatus_jetsam_snapshot_count == 0) {
4355 /*
4356 * No active snapshot.
4357 * Nothing to do.
4358 */
4359 return;
4360 }
4361
4362 /*
4363 * Sanity check as this routine should only be called
4364 * from a jetsam kill path.
4365 */
4366 assert(kill_cause != 0 && killtime != 0);
4367
4368 snapshot = memorystatus_jetsam_snapshot;
4369 snapshot_list = memorystatus_jetsam_snapshot->entries;
4370
4371 for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
4372 if (snapshot_list[i].pid == p->p_pid) {
4373
4374 entry = &snapshot_list[i];
4375
4376 if (entry->killed || entry->jse_killtime) {
4377 /*
4378 * We apparently raced on the exit path
4379 * for this process, as it's snapshot entry
4380 * has already recorded a kill.
4381 */
4382 assert(entry->killed && entry->jse_killtime);
4383 break;
4384 }
4385
4386 /*
4387 * Update the entry we just found in the snapshot.
4388 */
4389
4390 entry->killed = kill_cause;
4391 entry->jse_killtime = killtime;
4392 entry->jse_gencount = snapshot->js_gencount;
4393 entry->jse_idle_delta = p->p_memstat_idle_delta;
4394
4395 /*
4396 * If a process has moved between bands since the snapshot was
4397 * initialized, then these fields have likely changed too.
4398 */
4399 if (entry->priority != p->p_memstat_effectivepriority) {
4400
4401 strlcpy(entry->name, p->p_name, sizeof(entry->name));
4402 entry->priority = p->p_memstat_effectivepriority;
4403 entry->state = memorystatus_build_state(p);
4404 entry->user_data = p->p_memstat_userdata;
4405 entry->fds = p->p_fd->fd_nfiles;
4406 }
4407
4408 /*
4409 * Always update the page counts on a kill.
4410 */
4411
4412 uint32_t pages = 0;
4413 uint32_t max_pages = 0;
4414 uint32_t max_pages_lifetime = 0;
4415 uint32_t purgeable_pages = 0;
4416
4417 memorystatus_get_task_page_counts(p->task, &pages, &max_pages, &max_pages_lifetime, &purgeable_pages);
4418 entry->pages = (uint64_t)pages;
4419 entry->max_pages = (uint64_t)max_pages;
4420 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
4421 entry->purgeable_pages = (uint64_t)purgeable_pages;
4422
4423 uint64_t internal_pages = 0;
4424 uint64_t internal_compressed_pages = 0;
4425 uint64_t purgeable_nonvolatile_pages = 0;
4426 uint64_t purgeable_nonvolatile_compressed_pages = 0;
4427 uint64_t alternate_accounting_pages = 0;
4428 uint64_t alternate_accounting_compressed_pages = 0;
4429 uint64_t iokit_mapped_pages = 0;
4430 uint64_t page_table_pages = 0;
4431
4432 memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
4433 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
4434 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
4435 &iokit_mapped_pages, &page_table_pages);
4436
4437 entry->jse_internal_pages = internal_pages;
4438 entry->jse_internal_compressed_pages = internal_compressed_pages;
4439 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
4440 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
4441 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
4442 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
4443 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
4444 entry->jse_page_table_pages = page_table_pages;
4445
4446 uint64_t region_count = 0;
4447 memorystatus_get_task_memory_region_count(p->task, &region_count);
4448 entry->jse_memory_region_count = region_count;
4449
4450 goto exit;
4451 }
4452 }
4453
4454 if (entry == NULL) {
4455 /*
4456 * The entry was not found in the snapshot, so the process must have
4457 * launched after the snapshot was initialized.
4458 * Let's try to append the new entry.
4459 */
4460 if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) {
4461 /*
4462 * A populated snapshot buffer exists
4463 * and there is room to init a new entry.
4464 */
4465 assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count);
4466
4467 unsigned int next = memorystatus_jetsam_snapshot_count;
4468
4469 if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[next], (snapshot->js_gencount)) == TRUE) {
4470
4471 entry = &snapshot_list[next];
4472 entry->killed = kill_cause;
4473 entry->jse_killtime = killtime;
4474
4475 snapshot->entry_count = ++next;
4476 memorystatus_jetsam_snapshot_count = next;
4477
4478 if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) {
4479 /*
4480 * We just used the last slot in the snapshot buffer.
4481 * We only want to log it once... so we do it here
4482 * when we notice we've hit the max.
4483 */
4484 printf("memorystatus: WARNING snapshot buffer is full, count %d\n",
4485 memorystatus_jetsam_snapshot_count);
4486 }
4487 }
4488 }
4489 }
4490
4491 exit:
4492 if (entry == NULL) {
4493 /*
4494 * If we reach here, the snapshot buffer could not be updated.
4495 * Most likely, the buffer is full, in which case we would have
4496 * logged a warning in the previous call.
4497 *
4498 * For now, we will stop appending snapshot entries.
4499 * When the buffer is consumed, the snapshot state will reset.
4500 */
4501
4502 MEMORYSTATUS_DEBUG(4, "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n",
4503 p->p_pid, p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count);
4504 }
4505
4506 return;
4507 }
4508
4509 #if CONFIG_JETSAM
4510 void memorystatus_pages_update(unsigned int pages_avail)
4511 {
4512 memorystatus_available_pages = pages_avail;
4513
4514 #if VM_PRESSURE_EVENTS
4515 /*
4516 * Since memorystatus_available_pages changes, we should
4517 * re-evaluate the pressure levels on the system and
4518 * check if we need to wake the pressure thread.
4519 * We also update memorystatus_level in that routine.
4520 */
4521 vm_pressure_response();
4522
4523 if (memorystatus_available_pages <= memorystatus_available_pages_pressure) {
4524
4525 if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) {
4526 memorystatus_thread_wake();
4527 }
4528 }
4529 #else /* VM_PRESSURE_EVENTS */
4530
4531 boolean_t critical, delta;
4532
4533 if (!memorystatus_delta) {
4534 return;
4535 }
4536
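/*
 * Wake the memorystatus thread only when available pages have dropped
 * below the critical threshold or have moved by at least
 * memorystatus_delta; this avoids spurious wakeups.
 */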
4537 critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
4538 delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
4539 || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;
4540
4541 if (critical || delta) {
4542 unsigned int total_pages;
4543
4544 total_pages = (unsigned int) atop_64(max_mem);
4545 #if CONFIG_SECLUDED_MEMORY
4546 total_pages -= vm_page_secluded_count;
4547 #endif /* CONFIG_SECLUDED_MEMORY */
4548 memorystatus_level = memorystatus_available_pages * 100 / total_pages;
4549 memorystatus_thread_wake();
4550 }
4551 #endif /* VM_PRESSURE_EVENTS */
4552 }
4553 #endif /* CONFIG_JETSAM */
4554
4555 static boolean_t
4556 memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount)
4557 {
4558 clock_sec_t tv_sec;
4559 clock_usec_t tv_usec;
4560 uint32_t pages = 0;
4561 uint32_t max_pages = 0;
4562 uint32_t max_pages_lifetime = 0;
4563 uint32_t purgeable_pages = 0;
4564 uint64_t internal_pages = 0;
4565 uint64_t internal_compressed_pages = 0;
4566 uint64_t purgeable_nonvolatile_pages = 0;
4567 uint64_t purgeable_nonvolatile_compressed_pages = 0;
4568 uint64_t alternate_accounting_pages = 0;
4569 uint64_t alternate_accounting_compressed_pages = 0;
4570 uint64_t iokit_mapped_pages = 0;
4571 uint64_t page_table_pages = 0;
4572 uint64_t region_count = 0;
4573 uint64_t cids[COALITION_NUM_TYPES];
4574
4575 memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
4576
4577 entry->pid = p->p_pid;
4578 strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
4579 entry->priority = p->p_memstat_effectivepriority;
4580
4581 memorystatus_get_task_page_counts(p->task, &pages, &max_pages, &max_pages_lifetime, &purgeable_pages);
4582 entry->pages = (uint64_t)pages;
4583 entry->max_pages = (uint64_t)max_pages;
4584 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
4585 entry->purgeable_pages = (uint64_t)purgeable_pages;
4586
4587 memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
4588 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
4589 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
4590 &iokit_mapped_pages, &page_table_pages);
4591
4592 entry->jse_internal_pages = internal_pages;
4593 entry->jse_internal_compressed_pages = internal_compressed_pages;
4594 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
4595 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
4596 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
4597 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
4598 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
4599 entry->jse_page_table_pages = page_table_pages;
4600
4601 memorystatus_get_task_memory_region_count(p->task, &region_count);
4602 entry->jse_memory_region_count = region_count;
4603
4604 entry->state = memorystatus_build_state(p);
4605 entry->user_data = p->p_memstat_userdata;
4606 memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid));
4607 entry->fds = p->p_fd->fd_nfiles;
4608
4609 absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
4610 entry->cpu_time.tv_sec = tv_sec;
4611 entry->cpu_time.tv_usec = tv_usec;
4612
4613 assert(p->p_stats != NULL);
4614 entry->jse_starttime = p->p_stats->ps_start; /* abstime process started */
4615 entry->jse_killtime = 0; /* abstime jetsam chose to kill process */
4616 entry->killed = 0; /* the jetsam kill cause */
4617 entry->jse_gencount = gencount; /* indicates a pass through jetsam thread, when process was targeted to be killed */
4618
4619 entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */
4620
4621 proc_coalitionids(p, cids);
4622 entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
4623
4624 return TRUE;
4625 }
4626
4627 static void
4628 memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
4629 {
4630 kern_return_t kr = KERN_SUCCESS;
4631 mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
4632 vm_statistics64_data_t vm_stat;
4633
4634 if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
4635 printf("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr);
4636 memset(&snapshot->stats, 0, sizeof(snapshot->stats));
4637 } else {
4638 snapshot->stats.free_pages = vm_stat.free_count;
4639 snapshot->stats.active_pages = vm_stat.active_count;
4640 snapshot->stats.inactive_pages = vm_stat.inactive_count;
4641 snapshot->stats.throttled_pages = vm_stat.throttled_count;
4642 snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
4643 snapshot->stats.wired_pages = vm_stat.wire_count;
4644
4645 snapshot->stats.speculative_pages = vm_stat.speculative_count;
4646 snapshot->stats.filebacked_pages = vm_stat.external_page_count;
4647 snapshot->stats.anonymous_pages = vm_stat.internal_page_count;
4648 snapshot->stats.compressions = vm_stat.compressions;
4649 snapshot->stats.decompressions = vm_stat.decompressions;
4650 snapshot->stats.compressor_pages = vm_stat.compressor_page_count;
4651 snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
4652 }
4653
4654 get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity);
4655 get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name),
4656 &snapshot->stats.largest_zone_size);
4657 }
4658
4659 /*
4660 * Collect vm statistics at boot.
4661 * Called only once (see kern_exec.c)
4662 * Data can be consumed at any time.
4663 */
4664 void
4665 memorystatus_init_at_boot_snapshot(void) {
4666 memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
4667 memorystatus_at_boot_snapshot.entry_count = 0;
4668 memorystatus_at_boot_snapshot.notification_time = 0; /* updated when consumed */
4669 memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
4670 }
4671
4672 static void
4673 memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count )
4674 {
4675 proc_t p, next_p;
4676 unsigned int b = 0, i = 0;
4677
4678 memorystatus_jetsam_snapshot_t *snapshot = NULL;
4679 memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
4680 unsigned int snapshot_max = 0;
4681
4682 if (od_snapshot) {
4683 /*
4684 * This is an on_demand snapshot
4685 */
4686 snapshot = od_snapshot;
4687 snapshot_list = od_snapshot->entries;
4688 snapshot_max = ods_list_count;
4689 } else {
4690 /*
4691 * This is a jetsam event snapshot
4692 */
4693 snapshot = memorystatus_jetsam_snapshot;
4694 snapshot_list = memorystatus_jetsam_snapshot->entries;
4695 snapshot_max = memorystatus_jetsam_snapshot_max;
4696 }
4697
4698 /*
4699 * Init the snapshot header information
4700 */
4701 memorystatus_init_snapshot_vmstats(snapshot);
4702 snapshot->snapshot_time = mach_absolute_time();
4703 snapshot->notification_time = 0;
4704 snapshot->js_gencount = 0;
4705
4706 next_p = memorystatus_get_first_proc_locked(&b, TRUE);
4707 while (next_p) {
4708 p = next_p;
4709 next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);
4710
4711 if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) {
4712 continue;
4713 }
4714
4715 MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
4716 p->p_pid,
4717 p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7],
4718 p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]);
4719
4720 if (++i == snapshot_max) {
4721 break;
4722 }
4723 }
4724
4725 snapshot->entry_count = i;
4726
4727 if (!od_snapshot) {
4728 /* update the system buffer count */
4729 memorystatus_jetsam_snapshot_count = i;
4730 }
4731 }
4732
4733 #if DEVELOPMENT || DEBUG
4734
4735 #if CONFIG_JETSAM
4736 static int
4737 memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size) {
4738 int ret;
4739 memorystatus_jetsam_panic_options_t debug;
4740
4741 if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) {
4742 return EINVAL;
4743 }
4744
4745 ret = copyin(buffer, &debug, buffer_size);
4746 if (ret) {
4747 return ret;
4748 }
4749
4750 /* Panic bits match kMemorystatusKilled* enum */
4751 memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask);
4752
4753 /* Copyout new value */
4754 debug.data = memorystatus_jetsam_panic_debug;
4755 ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t));
4756
4757 return ret;
4758 }
4759 #endif /* CONFIG_JETSAM */
4760
4761 /*
4762 * Triggers a sort_order on a specified jetsam priority band.
4763 * This is for testing only, used to force a path through the sort
4764 * function.
4765 */
4766 static int
4767 memorystatus_cmd_test_jetsam_sort(int priority, int sort_order) {
4768
4769 int error = 0;
4770
4771 unsigned int bucket_index = 0;
4772
4773 if (priority == -1) {
4774 /* Use as shorthand for default priority */
4775 bucket_index = JETSAM_PRIORITY_DEFAULT;
4776 } else {
4777 bucket_index = (unsigned int)priority;
4778 }
4779
4780 error = memorystatus_sort_bucket(bucket_index, sort_order);
4781
4782 return (error);
4783 }
4784
4785 #endif /* DEVELOPMENT || DEBUG */
4786
4787 /*
4788 * Jetsam the first process in the queue.
4789 */
4790 static boolean_t
4791 memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason,
4792 int32_t *priority, uint32_t *errors)
4793 {
4794 pid_t aPid;
4795 proc_t p = PROC_NULL, next_p = PROC_NULL;
4796 boolean_t new_snapshot = FALSE, force_new_snapshot = FALSE, killed = FALSE;
4797 int kill_count = 0;
4798 unsigned int i = 0;
4799 uint32_t aPid_ep;
4800 uint64_t killtime = 0;
4801 clock_sec_t tv_sec;
4802 clock_usec_t tv_usec;
4803 uint32_t tv_msec;
4804 int32_t local_max_kill_prio = JETSAM_PRIORITY_IDLE;
4805
4806 #ifndef CONFIG_FREEZE
4807 #pragma unused(any)
4808 #endif
4809
4810 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
4811 memorystatus_available_pages, 0, 0, 0, 0);
4812
4813
4814 #if CONFIG_JETSAM
4815 if (sort_flag == TRUE) {
4816 (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
4817 }
4818
4819 local_max_kill_prio = max_kill_priority;
4820
4821 force_new_snapshot = FALSE;
4822
4823 #else /* CONFIG_JETSAM */
4824
4825 if (sort_flag == TRUE) {
4826 (void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT);
4827 }
4828
4829 /*
4830 * On macOS, we currently only have 2 reasons to be here:
4831 *
4832 * kMemorystatusKilledZoneMapExhaustion
4833 * AND
4834 * kMemorystatusKilledVMThrashing
4835 *
4836 * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider
4837 * any and all processes as eligible kill candidates since we need to avoid a panic.
4838 *
4839 * Since this function can be called async. it is harder to toggle the max_kill_priority
4840 * value before and after a call. And so we use this local variable to set the upper band
4841 * on the eligible kill bands.
4842 */
4843 if (cause == kMemorystatusKilledZoneMapExhaustion) {
4844 local_max_kill_prio = JETSAM_PRIORITY_MAX;
4845 } else {
4846 local_max_kill_prio = max_kill_priority;
4847 }
4848
4849 /*
4850 * And, because we are here under extreme circumstances, we force a snapshot even for
4851 * IDLE kills.
4852 */
4853 force_new_snapshot = TRUE;
4854
4855 #endif /* CONFIG_JETSAM */
4856
4857 proc_list_lock();
4858
4859 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
4860 while (next_p && (next_p->p_memstat_effectivepriority <= local_max_kill_prio)) {
4861 #if DEVELOPMENT || DEBUG
4862 int activeProcess;
4863 int procSuspendedForDiagnosis;
4864 #endif /* DEVELOPMENT || DEBUG */
4865
4866 p = next_p;
4867 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
4868
4869 #if DEVELOPMENT || DEBUG
4870 activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
4871 procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
4872 #endif /* DEVELOPMENT || DEBUG */
4873
4874 aPid = p->p_pid;
4875 aPid_ep = p->p_memstat_effectivepriority;
4876
4877 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
4878 continue; /* with lock held */
4879 }
4880
4881 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
4882 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
4883 printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
4884 continue;
4885 }
4886 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
4887
4888 if (cause == kMemorystatusKilledVnodes)
4889 {
4890 /*
4891 * If the system runs out of vnodes, we systematically jetsam
4892 * processes in hopes of stumbling onto a vnode gain that helps
4893 * the system recover. The process that happens to trigger
4894 * this path has no known relationship to the vnode shortage.
4895 * Deadlock avoidance: attempt to safeguard the caller.
4896 */
4897
4898 if (p == current_proc()) {
4899 /* do not jetsam the current process */
4900 continue;
4901 }
4902 }
4903
4904 #if CONFIG_FREEZE
4905 boolean_t skip;
4906 boolean_t reclaim_proc = !(p->p_memstat_state & (P_MEMSTAT_LOCKED | P_MEMSTAT_NORECLAIM));
4907 if (any || reclaim_proc) {
4908 skip = FALSE;
4909 } else {
4910 skip = TRUE;
4911 }
4912
4913 if (skip) {
4914 continue;
4915 } else
4916 #endif
4917 {
4918 /*
4919 * Capture a snapshot if none exists and:
4920 * - we are forcing a new snapshot creation, either because:
4921 * - on a particular platform we need these snapshots every time, OR
4922 * - a boot-arg/embedded device tree property has been set.
4923 * - priority was not requested (this is something other than an ambient kill)
4924 * - the priority was requested *and* the targeted process is not at idle priority
4925 */
4926 if ((memorystatus_jetsam_snapshot_count == 0) &&
4927 (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) {
4928 memorystatus_init_jetsam_snapshot_locked(NULL,0);
4929 new_snapshot = TRUE;
4930 }
4931
4932 /*
4933 * Mark as terminated so that if exit1() indicates success, but the process (for example)
4934 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
4935 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
4936 * acquisition of the proc lock.
4937 */
4938 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
4939
4940 killtime = mach_absolute_time();
4941 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
4942 tv_msec = tv_usec / 1000;
4943
4944 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
4945 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && activeProcess) {
4946 MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n",
4947 aPid, (*p->p_name ? p->p_name: "(unknown)"), memorystatus_level);
4948 memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic, killtime);
4949 p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
4950 if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) {
4951 jetsam_diagnostic_suspended_one_active_proc = 1;
4952 printf("jetsam: returning after suspending first active proc - %d\n", aPid);
4953 }
4954
4955 p = proc_ref_locked(p);
4956 proc_list_unlock();
4957 if (p) {
4958 task_suspend(p->task);
4959 if (priority) {
4960 *priority = aPid_ep;
4961 }
4962 proc_rele(p);
4963 killed = TRUE;
4964 }
4965
4966 goto exit;
4967 } else
4968 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
4969 {
4970 /* Shift queue, update stats */
4971 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
4972
4973 if (proc_ref_locked(p) == p) {
4974 proc_list_unlock();
4975 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: %s pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
4976 (unsigned long)tv_sec, tv_msec,
4977 ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process" : "killing_top_process"),
4978 aPid, (*p->p_name ? p->p_name : "unknown"),
4979 memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages);
4980
4981 /*
4982 * memorystatus_do_kill() drops a reference, so take another one so we can
4983 * continue to use this exit reason even after memorystatus_do_kill()
4984 * returns.
4985 */
4986 os_reason_ref(jetsam_reason);
4987
4988 killed = memorystatus_do_kill(p, cause, jetsam_reason);
4989
4990 /* Success? */
4991 if (killed) {
4992 if (priority) {
4993 *priority = aPid_ep;
4994 }
4995 proc_rele(p);
4996 kill_count++;
4997 goto exit;
4998 }
4999
5000 /*
5001 * Failure - first unwind the state,
5002 * then fall through to restart the search.
5003 */
5004 proc_list_lock();
5005 proc_rele_locked(p);
5006 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5007 p->p_memstat_state |= P_MEMSTAT_ERROR;
5008 *errors += 1;
5009 }
5010
5011 /*
5012 * Failure - restart the search.
5013 *
5014 * We might have raced with "p" exiting on another core, resulting in no
5015 * ref on "p". Or, we may have failed to kill "p".
5016 *
5017 * Either way, we fall thru to here, leaving the proc in the
5018 * P_MEMSTAT_TERMINATED state.
5019 *
5020 * And, we hold the proc_list_lock at this point.
5021 */
5022
5023 i = 0;
5024 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5025 }
5026 }
5027 }
5028
5029 proc_list_unlock();
5030
5031 exit:
5032 os_reason_free(jetsam_reason);
5033
5034 /* Clear snapshot if freshly captured and no target was found */
5035 if (new_snapshot && !killed) {
5036 proc_list_lock();
5037 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
5038 proc_list_unlock();
5039 }
5040
5041 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
5042 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
5043
5044 return killed;
5045 }
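/*
 * A minimal standalone sketch of the snapshot-capture condition above; the
 * names and parameters are simplified stand-ins for the kernel globals, so
 * this is a hedged reading of the logic rather than the implementation itself.
 *
 *   #include <stdbool.h>
 *   #include <stdint.h>
 *
 *   // Capture a new snapshot only when none exists and either a snapshot is
 *   // forced, idle snapshots are enabled, or this is not an idle-band kill.
 *   static bool
 *   should_capture_snapshot(unsigned int snapshot_count, bool force_new,
 *       bool idle_snapshot_enabled, bool priority_requested,
 *       int32_t band, int32_t idle_band)
 *   {
 *       if (snapshot_count != 0) {
 *           return false;
 *       }
 *       return force_new || idle_snapshot_enabled ||
 *           !priority_requested || (band != idle_band);
 *   }
 */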
5046
5047 /*
5048 * Jetsam aggressively
5049 */
5050 static boolean_t
5051 memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count,
5052 int32_t priority_max, uint32_t *errors)
5053 {
5054 pid_t aPid;
5055 proc_t p = PROC_NULL, next_p = PROC_NULL;
5056 boolean_t new_snapshot = FALSE, killed = FALSE;
5057 int kill_count = 0;
5058 unsigned int i = 0;
5059 int32_t aPid_ep = 0;
5060 unsigned int memorystatus_level_snapshot = 0;
5061 uint64_t killtime = 0;
5062 clock_sec_t tv_sec;
5063 clock_usec_t tv_usec;
5064 uint32_t tv_msec;
5065 os_reason_t jetsam_reason = OS_REASON_NULL;
5066
5067 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
5068 memorystatus_available_pages, priority_max, 0, 0, 0);
5069
5070 memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
5071
5072 jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause);
5073 if (jetsam_reason == OS_REASON_NULL) {
5074 printf("memorystatus_kill_top_process_aggressive: failed to allocate exit reason\n");
5075 }
5076
5077 proc_list_lock();
5078
5079 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5080 while (next_p) {
5081 #if DEVELOPMENT || DEBUG
5082 int activeProcess;
5083 int procSuspendedForDiagnosis;
5084 #endif /* DEVELOPMENT || DEBUG */
5085
5086 if (((next_p->p_listflag & P_LIST_EXITED) != 0) ||
5087 ((unsigned int)(next_p->p_memstat_effectivepriority) != i)) {
5088
5089 /*
5090 * We have raced with next_p running on another core.
5091 * It may be exiting or it may have moved to a different
5092 * jetsam priority band. This means we have lost our
5093 * place in line while traversing the jetsam list. We
5094 * attempt to recover by rewinding to the beginning of the band
5095 * we were already traversing. By doing this, we do not guarantee
5096 * that no process escapes this aggressive march, but we can make
5097 * skipping an entire range of processes less likely. (PR-21069019)
5098 */
5099
5100 MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n",
5101 aggr_count, i, (*next_p->p_name ? next_p->p_name : "unknown"), next_p->p_pid);
5102
5103 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5104 continue;
5105 }
5106
5107 p = next_p;
5108 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
5109
5110 if (p->p_memstat_effectivepriority > priority_max) {
5111 /*
5112 * Bail out of this killing spree if we have
5113 * reached beyond the priority_max jetsam band.
5114 * That is, we kill up to and through the
5115 * priority_max jetsam band.
5116 */
5117 proc_list_unlock();
5118 goto exit;
5119 }
5120
5121 #if DEVELOPMENT || DEBUG
5122 activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
5123 procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
5124 #endif /* DEVELOPMENT || DEBUG */
5125
5126 aPid = p->p_pid;
5127 aPid_ep = p->p_memstat_effectivepriority;
5128
5129 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
5130 continue;
5131 }
5132
5133 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
5134 if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
5135 printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
5136 continue;
5137 }
5138 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
5139
5140 /*
5141 * Capture a snapshot if none exists.
5142 */
5143 if (memorystatus_jetsam_snapshot_count == 0) {
5144 memorystatus_init_jetsam_snapshot_locked(NULL,0);
5145 new_snapshot = TRUE;
5146 }
5147
5148 /*
5149 * Mark as terminated so that if exit1() indicates success, but the process (for example)
5150 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
5151 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
5152 * acquisition of the proc lock.
5153 */
5154 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
5155
5156 killtime = mach_absolute_time();
5157 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5158 tv_msec = tv_usec / 1000;
5159
5160 /* Shift queue, update stats */
5161 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
5162
5163 /*
5164 * In order to kill the target process, we will drop the proc_list_lock.
5165 * To guarantee that p and next_p don't disappear out from under the lock,
5166 * we must take a ref on both.
5167 * If we cannot get a reference, then it's likely we've raced with
5168 * that process exiting on another core.
5169 */
5170 if (proc_ref_locked(p) == p) {
5171 if (next_p) {
5172 while (next_p && (proc_ref_locked(next_p) != next_p)) {
5173 proc_t temp_p;
5174
5175 /*
5176 * We must have raced with next_p exiting on another core.
5177 * Recover by getting the next eligible process in the band.
5178 */
5179
5180 MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
5181 aggr_count, next_p->p_pid, (*next_p->p_name ? next_p->p_name : "(unknown)"));
5182
5183 temp_p = next_p;
5184 next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE);
5185 }
5186 }
5187 proc_list_unlock();
5188
5189 printf("%lu.%03d memorystatus: %s%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
5190 (unsigned long)tv_sec, tv_msec,
5191 ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"),
5192 aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"),
5193 memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages);
5194
5195 memorystatus_level_snapshot = memorystatus_level;
5196
5197 /*
5198 * memorystatus_do_kill() drops a reference, so take another one so we can
5199 * continue to use this exit reason even after memorystatus_do_kill()
5200 * returns.
5201 */
5202 os_reason_ref(jetsam_reason);
5203 killed = memorystatus_do_kill(p, cause, jetsam_reason);
5204
5205 /* Success? */
5206 if (killed) {
5207 proc_rele(p);
5208 kill_count++;
5209 p = NULL;
5210 killed = FALSE;
5211
5212 /*
5213 * Continue the killing spree.
5214 */
5215 proc_list_lock();
5216 if (next_p) {
5217 proc_rele_locked(next_p);
5218 }
5219
5220 if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) {
5221 if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) {
5222 #if DEVELOPMENT || DEBUG
5223 printf("Disabling Lenient mode after one-time deployment.\n");
5224 #endif /* DEVELOPMENT || DEBUG */
5225 memorystatus_aggressive_jetsam_lenient = FALSE;
5226 break;
5227 }
5228 }
5229
5230 continue;
5231 }
5232
5233 /*
5234 * Failure - first unwind the state,
5235 * then fall through to restart the search.
5236 */
5237 proc_list_lock();
5238 proc_rele_locked(p);
5239 if (next_p) {
5240 proc_rele_locked(next_p);
5241 }
5242 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5243 p->p_memstat_state |= P_MEMSTAT_ERROR;
5244 *errors += 1;
5245 p = NULL;
5246 }
5247
5248 /*
5249 * Failure - restart the search at the beginning of
5250 * the band we were already traversing.
5251 *
5252 * We might have raced with "p" exiting on another core, resulting in no
5253 * ref on "p". Or, we may have failed to kill "p".
5254 *
5255 * Either way, we fall thru to here, leaving the proc in the
5256 * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
5257 *
5258 * And, we hold the proc_list_lock at this point.
5259 */
5260
5261 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5262 }
5263
5264 proc_list_unlock();
5265
5266 exit:
5267 os_reason_free(jetsam_reason);
5268
5269 /* Clear snapshot if freshly captured and no target was found */
5270 if (new_snapshot && (kill_count == 0)) {
5271 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
5272 }
5273
5274 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
5275 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
5276
5277 if (kill_count > 0) {
5278 return(TRUE);
5279 }
5280 else {
5281 return(FALSE);
5282 }
5283 }
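/*
 * A minimal standalone sketch of the lenient-mode check above: after a kill
 * in the foreground band with lenient mode enabled, the spree stops and
 * lenient mode is disabled once the memory level has improved by at least the
 * lenient-mode threshold. The threshold value below is a made-up placeholder,
 * not the kernel's AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD.
 *
 *   #include <stdbool.h>
 *   #include <stdint.h>
 *
 *   #define EXAMPLE_LENIENT_MODE_THRESHOLD 25   // hypothetical value
 *
 *   static bool
 *   lenient_spree_should_stop(int32_t killed_band, int32_t foreground_band,
 *       bool lenient_enabled, unsigned int level_before, unsigned int level_after)
 *   {
 *       if (killed_band != foreground_band || !lenient_enabled) {
 *           return false;
 *       }
 *       return (level_after > level_before) &&
 *           ((level_after - level_before) >= EXAMPLE_LENIENT_MODE_THRESHOLD);
 *   }
 */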
5284
5285 static boolean_t
5286 memorystatus_kill_hiwat_proc(uint32_t *errors)
5287 {
5288 pid_t aPid = 0;
5289 proc_t p = PROC_NULL, next_p = PROC_NULL;
5290 boolean_t new_snapshot = FALSE, killed = FALSE;
5291 int kill_count = 0;
5292 unsigned int i = 0;
5293 uint32_t aPid_ep;
5294 uint64_t killtime = 0;
5295 clock_sec_t tv_sec;
5296 clock_usec_t tv_usec;
5297 uint32_t tv_msec;
5298 os_reason_t jetsam_reason = OS_REASON_NULL;
5299 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
5300 memorystatus_available_pages, 0, 0, 0, 0);
5301
5302 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
5303 if (jetsam_reason == OS_REASON_NULL) {
5304 printf("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n");
5305 }
5306
5307 proc_list_lock();
5308
5309 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5310 while (next_p) {
5311 uint64_t footprint_in_bytes = 0;
5312 uint64_t memlimit_in_bytes = 0;
5313 boolean_t skip = FALSE;
5314
5315 p = next_p;
5316 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
5317
5318 aPid = p->p_pid;
5319 aPid_ep = p->p_memstat_effectivepriority;
5320
5321 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
5322 continue;
5323 }
5324
5325 /* skip if no limit set */
5326 if (p->p_memstat_memlimit <= 0) {
5327 continue;
5328 }
5329
5330 footprint_in_bytes = get_task_phys_footprint(p->task);
5331 memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */
5332 skip = (footprint_in_bytes <= memlimit_in_bytes);
5333
5334 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
5335 if (!skip && (memorystatus_jetsam_policy & kPolicyDiagnoseActive)) {
5336 if (p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED) {
5337 continue;
5338 }
5339 }
5340 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
5341
5342 #if CONFIG_FREEZE
5343 if (!skip) {
5344 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
5345 skip = TRUE;
5346 } else {
5347 skip = FALSE;
5348 }
5349 }
5350 #endif
5351
5352 if (skip) {
5353 continue;
5354 } else {
5355 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
5356 MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %lld Mb > 1 (%d Mb)\n",
5357 (memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing",
5358 aPid, (*p->p_name ? p->p_name : "unknown"),
5359 (footprint_in_bytes / (1024ULL * 1024ULL)), /* converted bytes to MB */
5360 p->p_memstat_memlimit);
5361 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
5362
5363 if (memorystatus_jetsam_snapshot_count == 0) {
5364 memorystatus_init_jetsam_snapshot_locked(NULL,0);
5365 new_snapshot = TRUE;
5366 }
5367
5368 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
5369
5370 killtime = mach_absolute_time();
5371 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5372 tv_msec = tv_usec / 1000;
5373
5374 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
5375 if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
5376 MEMORYSTATUS_DEBUG(1, "jetsam: pid %d suspended for diagnosis - memorystatus_available_pages: %d\n", aPid, memorystatus_available_pages);
5377 memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic, killtime);
5378 p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
5379
5380 p = proc_ref_locked(p);
5381 proc_list_unlock();
5382 if (p) {
5383 task_suspend(p->task);
5384 proc_rele(p);
5385 killed = TRUE;
5386 }
5387
5388 goto exit;
5389 } else
5390 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
5391 {
5392 memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledHiwat, killtime);
5393
5394 if (proc_ref_locked(p) == p) {
5395 proc_list_unlock();
5396
5397 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_highwater_process pid %d [%s] (highwater %d) - memorystatus_available_pages: %llu\n",
5398 (unsigned long)tv_sec, tv_msec, aPid, (*p->p_name ? p->p_name : "unknown"), aPid_ep, (uint64_t)memorystatus_available_pages);
5399
5400 /*
5401 * memorystatus_do_kill drops a reference, so take another one so we can
5402 * continue to use this exit reason even after memorystatus_do_kill()
5403 * returns
5404 */
5405 os_reason_ref(jetsam_reason);
5406
5407 killed = memorystatus_do_kill(p, kMemorystatusKilledHiwat, jetsam_reason);
5408
5409 /* Success? */
5410 if (killed) {
5411 proc_rele(p);
5412 kill_count++;
5413 goto exit;
5414 }
5415
5416 /*
5417 * Failure - first unwind the state,
5418 * then fall through to restart the search.
5419 */
5420 proc_list_lock();
5421 proc_rele_locked(p);
5422 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5423 p->p_memstat_state |= P_MEMSTAT_ERROR;
5424 *errors += 1;
5425 }
5426
5427 /*
5428 * Failure - restart the search.
5429 *
5430 * We might have raced with "p" exiting on another core, resulting in no
5431 * ref on "p". Or, we may have failed to kill "p".
5432 *
5433 * Either way, we fall thru to here, leaving the proc in the
5434 * P_MEMSTAT_TERMINATED state.
5435 *
5436 * And, we hold the proc_list_lock at this point.
5437 */
5438
5439 i = 0;
5440 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5441 }
5442 }
5443 }
5444
5445 proc_list_unlock();
5446
5447 exit:
5448 os_reason_free(jetsam_reason);
5449
5450 /* Clear snapshot if freshly captured and no target was found */
5451 if (new_snapshot && !killed) {
5452 proc_list_lock();
5453 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
5454 proc_list_unlock();
5455 }
5456
5457 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
5458 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
5459
5460 return killed;
5461 }
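/*
 * A minimal standalone sketch of the highwater test above: the per-process
 * limit is stored in MB and converted to bytes before being compared with the
 * physical footprint, and a process with no limit configured is never a
 * candidate. The helper name is invented for this example.
 *
 *   #include <stdbool.h>
 *   #include <stdint.h>
 *
 *   static bool
 *   over_highwater(uint64_t footprint_in_bytes, int32_t memlimit_in_mb)
 *   {
 *       if (memlimit_in_mb <= 0) {
 *           return false;                 // no limit set: skip
 *       }
 *       uint64_t limit_in_bytes = (uint64_t)memlimit_in_mb * 1024ULL * 1024ULL;
 *       return footprint_in_bytes > limit_in_bytes;
 *   }
 *
 *   // e.g. over_highwater(120ULL << 20, 100) is true: a 120 MB footprint
 *   // exceeds a 100 MB limit.
 */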
5462
5463 /*
5464 * Jetsam a process pinned in the elevated band.
5465 *
5466 * Return: true -- at least one pinned process was jetsammed
5467 * false -- no pinned process was jetsammed
5468 */
5469 static boolean_t
5470 memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, int aggr_count, uint32_t *errors)
5471 {
5472 pid_t aPid = 0;
5473 proc_t p = PROC_NULL, next_p = PROC_NULL;
5474 boolean_t new_snapshot = FALSE, killed = FALSE;
5475 int kill_count = 0;
5476 unsigned int i = JETSAM_PRIORITY_ELEVATED_INACTIVE;
5477 uint32_t aPid_ep;
5478 uint64_t killtime = 0;
5479 clock_sec_t tv_sec;
5480 clock_usec_t tv_usec;
5481 uint32_t tv_msec;
5482
5483
5484 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
5485 memorystatus_available_pages, 0, 0, 0, 0);
5486
5487 proc_list_lock();
5488
5489 next_p = memorystatus_get_first_proc_locked(&i, FALSE);
5490 while (next_p) {
5491
5492 p = next_p;
5493 next_p = memorystatus_get_next_proc_locked(&i, p, FALSE);
5494
5495 aPid = p->p_pid;
5496 aPid_ep = p->p_memstat_effectivepriority;
5497
5498 /*
5499 * Only pick a process pinned in this elevated band
5500 */
5501 if (!(p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
5502 continue;
5503 }
5504
5505 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
5506 continue;
5507 }
5508
5509 #if CONFIG_FREEZE
5510 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
5511 continue;
5512 }
5513 #endif
5514
5515 #if DEVELOPMENT || DEBUG
5516 MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
5517 aggr_count,
5518 aPid, (*p->p_name ? p->p_name : "unknown"),
5519 memorystatus_available_pages);
5520 #endif /* DEVELOPMENT || DEBUG */
5521
5522 if (memorystatus_jetsam_snapshot_count == 0) {
5523 memorystatus_init_jetsam_snapshot_locked(NULL,0);
5524 new_snapshot = TRUE;
5525 }
5526
5527 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
5528
5529 killtime = mach_absolute_time();
5530 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5531 tv_msec = tv_usec / 1000;
5532
5533 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
5534
5535 if (proc_ref_locked(p) == p) {
5536
5537 proc_list_unlock();
5538
5539 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
5540 (unsigned long)tv_sec, tv_msec,
5541 aggr_count,
5542 aPid, (*p->p_name ? p->p_name : "unknown"),
5543 memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages);
5544
5545 /*
5546 * memorystatus_do_kill drops a reference, so take another one so we can
5547 * continue to use this exit reason even after memorystatus_do_kill()
5548 * returns
5549 */
5550 os_reason_ref(jetsam_reason);
5551 killed = memorystatus_do_kill(p, cause, jetsam_reason);
5552
5553 /* Success? */
5554 if (killed) {
5555 proc_rele(p);
5556 kill_count++;
5557 goto exit;
5558 }
5559
5560 /*
5561 * Failure - first unwind the state,
5562 * then fall through to restart the search.
5563 */
5564 proc_list_lock();
5565 proc_rele_locked(p);
5566 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5567 p->p_memstat_state |= P_MEMSTAT_ERROR;
5568 *errors += 1;
5569 }
5570
5571 /*
5572 * Failure - restart the search.
5573 *
5574 * We might have raced with "p" exiting on another core, resulting in no
5575 * ref on "p". Or, we may have failed to kill "p".
5576 *
5577 * Either way, we fall thru to here, leaving the proc in the
5578 * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state.
5579 *
5580 * And, we hold the proc_list_lock at this point.
5581 */
5582
5583 next_p = memorystatus_get_first_proc_locked(&i, FALSE);
5584 }
5585
5586 proc_list_unlock();
5587
5588 exit:
5589 os_reason_free(jetsam_reason);
5590
5591 /* Clear snapshot if freshly captured and no target was found */
5592 if (new_snapshot && (kill_count == 0)) {
5593 proc_list_lock();
5594 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
5595 proc_list_unlock();
5596 }
5597
5598 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
5599 memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
5600
5601 return (killed);
5602 }
5603
5604 static boolean_t
5605 memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) {
5606 /*
5607 * TODO: allow a general async path
5608 *
5609 * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to
5610 * add the appropriate exit reason code mapping.
5611 */
5612 if ((victim_pid != -1) || (cause != kMemorystatusKilledVMPageShortage && cause != kMemorystatusKilledVMThrashing &&
5613 cause != kMemorystatusKilledFCThrashing && cause != kMemorystatusKilledZoneMapExhaustion)) {
5614 return FALSE;
5615 }
5616
5617 kill_under_pressure_cause = cause;
5618 memorystatus_thread_wake();
5619 return TRUE;
5620 }
5621
5622 boolean_t
5623 memorystatus_kill_on_VM_thrashing(boolean_t async) {
5624 if (async) {
5625 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMThrashing);
5626 } else {
5627 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMTHRASHING);
5628 if (jetsam_reason == OS_REASON_NULL) {
5629 printf("memorystatus_kill_on_VM_thrashing -- sync: failed to allocate jetsam reason\n");
5630 }
5631
5632 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing, jetsam_reason);
5633 }
5634 }
5635
5636 #if CONFIG_JETSAM
5637 boolean_t
5638 memorystatus_kill_on_VM_page_shortage(boolean_t async) {
5639 if (async) {
5640 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
5641 } else {
5642 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMPAGESHORTAGE);
5643 if (jetsam_reason == OS_REASON_NULL) {
5644 printf("memorystatus_kill_on_VM_page_shortage -- sync: failed to allocate jetsam reason\n");
5645 }
5646
5647 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage, jetsam_reason);
5648 }
5649 }
5650
5651 boolean_t
5652 memorystatus_kill_on_FC_thrashing(boolean_t async) {
5653
5654
5655 if (async) {
5656 return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing);
5657 } else {
5658 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_FCTHRASHING);
5659 if (jetsam_reason == OS_REASON_NULL) {
5660 printf("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n");
5661 }
5662
5663 return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing, jetsam_reason);
5664 }
5665 }
5666
5667 boolean_t
5668 memorystatus_kill_on_vnode_limit(void) {
5669 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE);
5670 if (jetsam_reason == OS_REASON_NULL) {
5671 printf("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n");
5672 }
5673
5674 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason);
5675 }
5676
5677 #endif /* CONFIG_JETSAM */
5678
5679 boolean_t
5680 memorystatus_kill_on_zone_map_exhaustion(pid_t pid) {
5681 boolean_t res = FALSE;
5682 if (pid == -1) {
5683 res = memorystatus_kill_process_async(-1, kMemorystatusKilledZoneMapExhaustion);
5684 } else {
5685 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION);
5686 if (jetsam_reason == OS_REASON_NULL) {
5687 printf("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n");
5688 }
5689
5690 res = memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason);
5691 }
5692 return res;
5693 }
5694
5695 #if CONFIG_FREEZE
5696
5697 __private_extern__ void
5698 memorystatus_freeze_init(void)
5699 {
5700 kern_return_t result;
5701 thread_t thread;
5702
5703 freezer_lck_grp_attr = lck_grp_attr_alloc_init();
5704 freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr);
5705
5706 lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL);
5707
5708 result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread);
5709 if (result == KERN_SUCCESS) {
5710 thread_deallocate(thread);
5711 } else {
5712 panic("Could not create memorystatus_freeze_thread");
5713 }
5714 }
5715
5716 /*
5717 * Synchronously freeze the passed proc. Called with a reference to the proc held.
5718 *
5719 * Returns EINVAL or the value returned by task_freeze().
5720 */
5721 int
5722 memorystatus_freeze_process_sync(proc_t p)
5723 {
5724 int ret = EINVAL;
5725 pid_t aPid = 0;
5726 boolean_t memorystatus_freeze_swap_low = FALSE;
5727
5728 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
5729 memorystatus_available_pages, 0, 0, 0, 0);
5730
5731 lck_mtx_lock(&freezer_mutex);
5732
5733 if (p == NULL) {
5734 goto exit;
5735 }
5736
5737 if (memorystatus_freeze_enabled == FALSE) {
5738 goto exit;
5739 }
5740
5741 if (!memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
5742 goto exit;
5743 }
5744
5745 if (memorystatus_freeze_update_throttle()) {
5746 printf("memorystatus_freeze_process_sync: in throttle, ignorning freeze\n");
5747 memorystatus_freeze_throttle_count++;
5748 goto exit;
5749 }
5750
5751 proc_list_lock();
5752
5753 if (p != NULL) {
5754 uint32_t purgeable, wired, clean, dirty, state;
5755 uint32_t max_pages, pages, i;
5756 boolean_t shared;
5757
5758 aPid = p->p_pid;
5759 state = p->p_memstat_state;
5760
5761 /* Ensure the process is eligible for freezing */
5762 if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
5763 proc_list_unlock();
5764 goto exit;
5765 }
5766
5767 /* Only freeze processes meeting our minimum resident page criteria */
5768 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
5769 if (pages < memorystatus_freeze_pages_min) {
5770 proc_list_unlock();
5771 goto exit;
5772 }
5773
5774 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
5775
5776 unsigned int avail_swap_space = 0; /* in pages. */
5777
5778 /*
5779 * Freezer backed by the compressor and swap file(s)
5780 * while will hold compressed data.
5781 */
5782 avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
5783
5784 max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);
5785
5786 if (max_pages < memorystatus_freeze_pages_min) {
5787 proc_list_unlock();
5788 goto exit;
5789 }
5790 } else {
5791 /*
5792 * We only have the compressor without any swap.
5793 */
5794 max_pages = UINT32_MAX - 1;
5795 }
5796
5797 /* Mark as locked temporarily to avoid kill */
5798 p->p_memstat_state |= P_MEMSTAT_LOCKED;
5799 proc_list_unlock();
5800
5801 ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
5802
5803 DTRACE_MEMORYSTATUS6(memorystatus_freeze, proc_t, p, unsigned int, memorystatus_available_pages, boolean_t, purgeable, unsigned int, wired, uint32_t, clean, uint32_t, dirty);
5804
5805 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_process_sync: task_freeze %s for pid %d [%s] - "
5806 "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n",
5807 (ret == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"),
5808 memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared);
5809
5810 proc_list_lock();
5811 p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
5812
5813 if (ret == KERN_SUCCESS) {
5814 memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
5815
5816 memorystatus_frozen_count++;
5817
5818 p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
5819
5820 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
5821 /* Update stats */
5822 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
5823 throttle_intervals[i].pageouts += dirty;
5824 }
5825 }
5826
5827 memorystatus_freeze_pageouts += dirty;
5828 memorystatus_freeze_count++;
5829
5830 proc_list_unlock();
5831
5832 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
5833 } else {
5834 proc_list_unlock();
5835 }
5836 }
5837
5838 exit:
5839 lck_mtx_unlock(&freezer_mutex);
5840 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
5841 memorystatus_available_pages, aPid, 0, 0, 0);
5842
5843 return ret;
5844 }
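/*
 * A minimal standalone sketch of the swap-backed freeze budget computed
 * above: the budget is the smaller of the free swap space (in pages) and the
 * per-freeze page cap, and the freeze is abandoned when that budget drops
 * below the minimum. The page size constant is a placeholder for this
 * example, not a value taken from the kernel configuration.
 *
 *   #include <stdbool.h>
 *   #include <stdint.h>
 *
 *   #define EXAMPLE_PAGE_SIZE 4096ULL     // placeholder page size
 *
 *   // Returns true and fills *max_pages when a freeze may proceed.
 *   static bool
 *   freeze_budget(uint64_t free_swap_bytes, uint32_t pages_max,
 *       uint32_t pages_min, uint32_t *max_pages)
 *   {
 *       uint64_t avail_swap_pages = free_swap_bytes / EXAMPLE_PAGE_SIZE;
 *       uint64_t budget = (avail_swap_pages < pages_max) ? avail_swap_pages
 *                                                        : pages_max;
 *       if (budget < pages_min) {
 *           return false;                 // not enough swap left
 *       }
 *       *max_pages = (uint32_t)budget;
 *       return true;
 *   }
 */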
5845
5846 static int
5847 memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low)
5848 {
5849 pid_t aPid = 0;
5850 int ret = -1;
5851 proc_t p = PROC_NULL, next_p = PROC_NULL;
5852 unsigned int i = 0;
5853
5854 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
5855 memorystatus_available_pages, 0, 0, 0, 0);
5856
5857 proc_list_lock();
5858
5859 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5860 while (next_p) {
5861 kern_return_t kr;
5862 uint32_t purgeable, wired, clean, dirty;
5863 boolean_t shared;
5864 uint32_t pages;
5865 uint32_t max_pages = 0;
5866 uint32_t state;
5867
5868 p = next_p;
5869 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
5870
5871 aPid = p->p_pid;
5872 state = p->p_memstat_state;
5873
5874 /* Ensure the process is eligible for freezing */
5875 if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
5876 continue; // with lock held
5877 }
5878
5879 /* Only freeze processes meeting our minimum resident page criteria */
5880 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
5881 if (pages < memorystatus_freeze_pages_min) {
5882 continue; // with lock held
5883 }
5884
5885 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
5886
5887 /* Ensure there's enough free space to freeze this process. */
5888
5889 unsigned int avail_swap_space = 0; /* in pages. */
5890
5891 /*
5892 * Freezer backed by the compressor and swap file(s)
5893 * which will hold compressed data.
5894 */
5895 avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
5896
5897 max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);
5898
5899 if (max_pages < memorystatus_freeze_pages_min) {
5900 *memorystatus_freeze_swap_low = TRUE;
5901 proc_list_unlock();
5902 goto exit;
5903 }
5904 } else {
5905 /*
5906 * We only have the compressor pool.
5907 */
5908 max_pages = UINT32_MAX - 1;
5909 }
5910
5911 /* Mark as locked temporarily to avoid kill */
5912 p->p_memstat_state |= P_MEMSTAT_LOCKED;
5913
5914 p = proc_ref_locked(p);
5915 proc_list_unlock();
5916 if (!p) {
5917 goto exit;
5918 }
5919
5920 kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
5921
5922 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - "
5923 "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n",
5924 (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"),
5925 memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared);
5926
5927 proc_list_lock();
5928 p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
5929
5930 /* Success? */
5931 if (KERN_SUCCESS == kr) {
5932 memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
5933
5934 memorystatus_frozen_count++;
5935
5936 p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
5937
5938 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
5939 /* Update stats */
5940 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
5941 throttle_intervals[i].pageouts += dirty;
5942 }
5943 }
5944
5945 memorystatus_freeze_pageouts += dirty;
5946 memorystatus_freeze_count++;
5947
5948 proc_list_unlock();
5949
5950 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
5951
5952 /* Return KERN_SUCCESS */
5953 ret = kr;
5954
5955 } else {
5956 proc_list_unlock();
5957 }
5958
5959 proc_rele(p);
5960 goto exit;
5961 }
5962
5963 proc_list_unlock();
5964
5965 exit:
5966 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
5967 memorystatus_available_pages, aPid, 0, 0, 0);
5968
5969 return ret;
5970 }
5971
5972 static inline boolean_t
5973 memorystatus_can_freeze_processes(void)
5974 {
5975 boolean_t ret;
5976
5977 proc_list_lock();
5978
5979 if (memorystatus_suspended_count) {
5980 uint32_t average_resident_pages, estimated_processes;
5981
5982 /* Estimate the number of suspended processes we can fit */
5983 average_resident_pages = memorystatus_suspended_footprint_total / memorystatus_suspended_count;
5984 estimated_processes = memorystatus_suspended_count +
5985 ((memorystatus_available_pages - memorystatus_available_pages_critical) / average_resident_pages);
5986
5987 /* If it's predicted that no freeze will occur, lower the threshold temporarily */
5988 if (estimated_processes <= FREEZE_SUSPENDED_THRESHOLD_DEFAULT) {
5989 memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_LOW;
5990 } else {
5991 memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
5992 }
5993
5994 MEMORYSTATUS_DEBUG(1, "memorystatus_can_freeze_processes: %d suspended processes, %d average resident pages / process, %d suspended processes estimated\n",
5995 memorystatus_suspended_count, average_resident_pages, estimated_processes);
5996
5997 if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) {
5998 ret = TRUE;
5999 } else {
6000 ret = FALSE;
6001 }
6002 } else {
6003 ret = FALSE;
6004 }
6005
6006 proc_list_unlock();
6007
6008 return ret;
6009 }
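/*
 * A minimal standalone sketch of the heuristic above: it estimates how many
 * suspended processes could fit before the critical page threshold is hit,
 * using the average resident size of the already-suspended set; a small
 * estimate lowers the freeze-eligibility threshold. The example numbers in
 * the trailing comment are made up.
 *
 *   #include <stdint.h>
 *
 *   static uint32_t
 *   estimate_fittable_suspended(uint32_t suspended_count,
 *       uint64_t suspended_footprint_total_pages,
 *       uint32_t available_pages, uint32_t critical_pages)
 *   {
 *       // Caller guarantees suspended_count > 0, as in the path above.
 *       uint32_t average_resident_pages =
 *           (uint32_t)(suspended_footprint_total_pages / suspended_count);
 *       return suspended_count +
 *           ((available_pages - critical_pages) / average_resident_pages);
 *   }
 *
 *   // e.g. 8 suspended processes totalling 80000 resident pages average
 *   // 10000 pages each; with 30000 available and 6000 critical pages,
 *   // (30000 - 6000) / 10000 = 2 more fit, so the estimate is 10.
 */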
6010
6011 static boolean_t
6012 memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
6013 {
6014 boolean_t can_freeze = TRUE;
6015
6016 /* Only freeze if we're sufficiently low on memory; this holds off freeze right
6017 after boot, and is generally a no-op once we've reached steady state. */
6018 if (memorystatus_available_pages > memorystatus_freeze_threshold) {
6019 return FALSE;
6020 }
6021
6022 /* Check minimum suspended process threshold. */
6023 if (!memorystatus_can_freeze_processes()) {
6024 return FALSE;
6025 }
6026 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
6027
6028 if ( !VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
6029 /*
6030 * In-core compressor used for freezing WITHOUT on-disk swap support.
6031 */
6032 if (vm_compressor_low_on_space()) {
6033 if (memorystatus_freeze_swap_low) {
6034 *memorystatus_freeze_swap_low = TRUE;
6035 }
6036
6037 can_freeze = FALSE;
6038
6039 } else {
6040 if (memorystatus_freeze_swap_low) {
6041 *memorystatus_freeze_swap_low = FALSE;
6042 }
6043
6044 can_freeze = TRUE;
6045 }
6046 } else {
6047 /*
6048 * Freezing WITH on-disk swap support.
6049 *
6050 * In-core compressor fronts the swap.
6051 */
6052 if (vm_swap_low_on_space()) {
6053 if (memorystatus_freeze_swap_low) {
6054 *memorystatus_freeze_swap_low = TRUE;
6055 }
6056
6057 can_freeze = FALSE;
6058 }
6059
6060 }
6061
6062 return can_freeze;
6063 }
6064
6065 static void
6066 memorystatus_freeze_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
6067 {
6068 unsigned int freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE);
6069 if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) {
6070 if (!interval->max_pageouts) {
6071 interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / (24 * 60)));
6072 } else {
6073 printf("memorystatus_freeze_update_throttle_interval: %d minute throttle timeout, resetting\n", interval->mins);
6074 }
6075 interval->ts.tv_sec = interval->mins * 60;
6076 interval->ts.tv_nsec = 0;
6077 ADD_MACH_TIMESPEC(&interval->ts, ts);
6078 /* Since we update the throttle stats pre-freeze, adjust for overshoot here */
6079 if (interval->pageouts > interval->max_pageouts) {
6080 interval->pageouts -= interval->max_pageouts;
6081 } else {
6082 interval->pageouts = 0;
6083 }
6084 interval->throttle = FALSE;
6085 } else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) {
6086 printf("memorystatus_freeze_update_throttle_interval: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
6087 interval->throttle = TRUE;
6088 }
6089
6090 MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
6091 interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60,
6092 interval->throttle ? "on" : "off");
6093 }
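/*
 * A minimal standalone sketch of the budget formula above: each throttle
 * interval receives its prorated share of the daily pageout cap, scaled by a
 * burst multiple so shorter windows may briefly exceed their share. The page
 * size and the example inputs are placeholders.
 *
 *   #include <stdint.h>
 *
 *   #define EXAMPLE_PAGE_SIZE 4096ULL     // placeholder page size
 *
 *   static uint64_t
 *   interval_pageout_budget(uint32_t daily_mb_max, uint32_t interval_mins,
 *       uint32_t burst_multiple)
 *   {
 *       uint64_t daily_pageouts_max =
 *           (uint64_t)daily_mb_max * (1024ULL * 1024ULL / EXAMPLE_PAGE_SIZE);
 *       return (uint64_t)burst_multiple *
 *           (((uint64_t)interval_mins * daily_pageouts_max) / (24ULL * 60ULL));
 *   }
 *
 *   // e.g. 1024 MB/day, a 60-minute interval and a burst multiple of 2:
 *   // daily cap = 1024 * 256 = 262144 pages, the hour's share is 10922
 *   // pages, and the burst multiple doubles it to 21844 pages.
 */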
6094
6095 static boolean_t
6096 memorystatus_freeze_update_throttle(void)
6097 {
6098 clock_sec_t sec;
6099 clock_nsec_t nsec;
6100 mach_timespec_t ts;
6101 uint32_t i;
6102 boolean_t throttled = FALSE;
6103
6104 #if DEVELOPMENT || DEBUG
6105 if (!memorystatus_freeze_throttle_enabled)
6106 return FALSE;
6107 #endif
6108
6109 clock_get_system_nanotime(&sec, &nsec);
6110 ts.tv_sec = sec;
6111 ts.tv_nsec = nsec;
6112
6113 /* Check freeze pageouts over multiple intervals and throttle if we've exceeded our budget.
6114 *
6115 * This ensures that periods of inactivity can't be used as 'credit' towards freeze if the device has
6116 * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in
6117 * order to allow for bursts of activity.
6118 */
6119 for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
6120 memorystatus_freeze_update_throttle_interval(&ts, &throttle_intervals[i]);
6121 if (throttle_intervals[i].throttle == TRUE)
6122 throttled = TRUE;
6123 }
6124
6125 return throttled;
6126 }
6127
6128 static void
6129 memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
6130 {
6131 static boolean_t memorystatus_freeze_swap_low = FALSE;
6132
6133 lck_mtx_lock(&freezer_mutex);
6134 if (memorystatus_freeze_enabled) {
6135 if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
6136 /* Only freeze if we've not exceeded our pageout budgets.*/
6137 if (!memorystatus_freeze_update_throttle()) {
6138 memorystatus_freeze_top_process(&memorystatus_freeze_swap_low);
6139 } else {
6140 printf("memorystatus_freeze_thread: in throttle, ignoring freeze\n");
6141 memorystatus_freeze_throttle_count++; /* Throttled, update stats */
6142 }
6143 }
6144 }
6145 lck_mtx_unlock(&freezer_mutex);
6146
6147 assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
6148 thread_block((thread_continue_t) memorystatus_freeze_thread);
6149 }
6150
6151 static int
6152 sysctl_memorystatus_do_fastwake_warmup_all SYSCTL_HANDLER_ARGS
6153 {
6154 #pragma unused(oidp, req, arg1, arg2)
6155
6156 /* Need to be root or have entitlement */
6157 if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
6158 return EPERM;
6159 }
6160
6161 if (memorystatus_freeze_enabled == FALSE) {
6162 return ENOTSUP;
6163 }
6164
6165 do_fastwake_warmup_all();
6166
6167 return 0;
6168 }
6169
6170 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_do_fastwake_warmup_all, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
6171 0, 0, &sysctl_memorystatus_do_fastwake_warmup_all, "I", "");
6172
6173 #endif /* CONFIG_FREEZE */
6174
6175 #if VM_PRESSURE_EVENTS
6176
6177 #if CONFIG_MEMORYSTATUS
6178
6179 static int
6180 memorystatus_send_note(int event_code, void *data, size_t data_length) {
6181 int ret;
6182 struct kev_msg ev_msg;
6183
6184 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6185 ev_msg.kev_class = KEV_SYSTEM_CLASS;
6186 ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;
6187
6188 ev_msg.event_code = event_code;
6189
6190 ev_msg.dv[0].data_length = data_length;
6191 ev_msg.dv[0].data_ptr = data;
6192 ev_msg.dv[1].data_length = 0;
6193
6194 ret = kev_post_msg(&ev_msg);
6195 if (ret) {
6196 printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
6197 }
6198
6199 return ret;
6200 }
6201
6202 boolean_t
6203 memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded) {
6204
6205 boolean_t ret = FALSE;
6206 boolean_t found_knote = FALSE;
6207 struct knote *kn = NULL;
6208 int send_knote_count = 0;
6209
6210 /*
6211 * See comment in sysctl_memorystatus_vm_pressure_send.
6212 */
6213
6214 memorystatus_klist_lock();
6215
6216 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
6217 proc_t knote_proc = knote_get_kq(kn)->kq_p;
6218 pid_t knote_pid = knote_proc->p_pid;
6219
6220 if (knote_pid == pid) {
6221 /*
6222 * By setting the "fflags" here, we are forcing
6223 * a process to deal with the case where it's
6224 * bumping up into its memory limits. If we don't
6225 * do this here, we will end up depending on the
6226 * system pressure snapshot evaluation in
6227 * filt_memorystatus().
6228 */
6229
6230 #if CONFIG_EMBEDDED
6231 if (!limit_exceeded) {
6232 /*
6233 * Intentionally set either the unambiguous limit warning,
6234 * the system-wide critical or the system-wide warning
6235 * notification bit.
6236 */
6237
6238 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
6239 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
6240 found_knote = TRUE;
6241 send_knote_count++;
6242 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
6243 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
6244 found_knote = TRUE;
6245 send_knote_count++;
6246 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
6247 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
6248 found_knote = TRUE;
6249 send_knote_count++;
6250 }
6251 } else {
6252 /*
6253 * Send this notification when a process has exceeded a soft limit.
6254 */
6255 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
6256 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
6257 found_knote = TRUE;
6258 send_knote_count++;
6259 }
6260 }
6261 #else /* CONFIG_EMBEDDED */
6262 if (!limit_exceeded) {
6263
6264 /*
6265 * Processes on desktop are not expecting to handle a system-wide
6266 * critical or system-wide warning notification from this path.
6267 * Intentionally set only the unambiguous limit warning here.
6268 *
6269 * If the limit is soft, however, limit this to one notification per
6270 * active/inactive limit (per each registered listener).
6271 */
6272
6273 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
6274 found_knote = TRUE;
6275 if (!is_fatal) {
6276 /*
6277 * Restrict proc_limit_warn notifications when
6278 * non-fatal (soft) limit is at play.
6279 */
6280 if (is_active) {
6281 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
6282 /*
6283 * Mark this knote for delivery.
6284 */
6285 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
6286 /*
6287 * And suppress it from future notifications.
6288 */
6289 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
6290 send_knote_count++;
6291 }
6292 } else {
6293 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
6294 /*
6295 * Mark this knote for delivery.
6296 */
6297 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
6298 /*
6299 * And suppress it from future notifications.
6300 */
6301 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
6302 send_knote_count++;
6303 }
6304 }
6305 } else {
6306 /*
6307 * No restriction on proc_limit_warn notifications when
6308 * fatal (hard) limit is at play.
6309 */
6310 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
6311 send_knote_count++;
6312 }
6313 }
6314 } else {
6315 /*
6316 * Send this notification when a process has exceeded a soft limit.
6317 */
6318
6319 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
6320 found_knote = TRUE;
6321 if (!is_fatal) {
6322 /*
6323 * Restrict critical notifications for soft limits.
6324 */
6325
6326 if (is_active) {
6327 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
6328 /*
6329 * Suppress future proc_limit_critical notifications
6330 * for the active soft limit.
6331 */
6332 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
6333 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
6334 send_knote_count++;
6335
6336 }
6337 } else {
6338 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
6339 /*
6340 * Suppress future proc_limit_critical_notifications
6341 * for the inactive soft limit.
6342 */
6343 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
6344 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
6345 send_knote_count++;
6346 }
6347 }
6348 } else {
6349 /*
6350 * We should never be trying to send a critical notification for
6351 * a hard limit... the process would be killed before it could be
6352 * received.
6353 */
6354 panic("Caught sending pid %d a critical warning for a fatal limit.\n", pid);
6355 }
6356 }
6357 }
6358 #endif /* CONFIG_EMBEDDED */
6359 }
6360 }
6361
6362 if (found_knote) {
6363 if (send_knote_count > 0) {
6364 KNOTE(&memorystatus_klist, 0);
6365 }
6366 ret = TRUE;
6367 }
6368
6369 memorystatus_klist_unlock();
6370
6371 return ret;
6372 }
6373
6374 /*
6375 * Can only be set by the current task on itself.
6376 */
6377 int
6378 memorystatus_low_mem_privileged_listener(uint32_t op_flags)
6379 {
6380 boolean_t set_privilege = FALSE;
6381 /*
6382 * Need an entitlement check here?
6383 */
6384 if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
6385 set_privilege = TRUE;
6386 } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
6387 set_privilege = FALSE;
6388 } else {
6389 return EINVAL;
6390 }
6391
6392 return (task_low_mem_privileged_listener(current_task(), set_privilege, NULL));
6393 }
6394
6395 int
6396 memorystatus_send_pressure_note(pid_t pid) {
6397 MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
6398 return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
6399 }
6400
6401 void
6402 memorystatus_send_low_swap_note(void) {
6403
6404 struct knote *kn = NULL;
6405
6406 memorystatus_klist_lock();
6407 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
6408 /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
6409 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
6410 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
6411 * kMemoryStatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
6412 if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
6413 KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
6414 break;
6415 }
6416 }
6417
6418 memorystatus_klist_unlock();
6419 }
6420
6421 boolean_t
6422 memorystatus_bg_pressure_eligible(proc_t p) {
6423 boolean_t eligible = FALSE;
6424
6425 proc_list_lock();
6426
6427 MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);
6428
6429 /* Foreground processes have already been dealt with at this point, so just test for eligibility */
6430 if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
6431 eligible = TRUE;
6432 }
6433
6434 proc_list_unlock();
6435
6436 return eligible;
6437 }
6438
6439 boolean_t
6440 memorystatus_is_foreground_locked(proc_t p) {
6441 return ((p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
6442 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT));
6443 }
6444
6445 /*
6446 * This is meant for stackshot and kperf -- it does not take the proc_list_lock
6447 * to access the p_memstat_dirty field.
6448 */
6449 boolean_t
6450 memorystatus_proc_is_dirty_unsafe(void *v)
6451 {
6452 if (!v) {
6453 return FALSE;
6454 }
6455 proc_t p = (proc_t)v;
6456 return (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
6457 }
6458
6459 #endif /* CONFIG_MEMORYSTATUS */
6460
6461 /*
6462 * Trigger levels to test the mechanism.
6463 * Can be used via a sysctl.
6464 */
6465 #define TEST_LOW_MEMORY_TRIGGER_ONE 1
6466 #define TEST_LOW_MEMORY_TRIGGER_ALL 2
6467 #define TEST_PURGEABLE_TRIGGER_ONE 3
6468 #define TEST_PURGEABLE_TRIGGER_ALL 4
6469 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE 5
6470 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL 6
6471
6472 boolean_t memorystatus_manual_testing_on = FALSE;
6473 vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;
6474
6475 extern struct knote *
6476 vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t);
6477
6478 /*
6479 * This value is the threshold that a process must meet to be considered for scavenging.
6480 */
6481 #if CONFIG_EMBEDDED
6482 #define VM_PRESSURE_MINIMUM_RSIZE 1 /* MB */
6483 #else /* CONFIG_EMBEDDED */
6484 #define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */
6485 #endif /* CONFIG_EMBEDDED */
6486
6487 #define VM_PRESSURE_NOTIFY_WAIT_PERIOD 10000 /* milliseconds */
6488
6489 #if DEBUG
6490 #define VM_PRESSURE_DEBUG(cond, format, ...) \
6491 do { \
6492 if (cond) { printf(format, ##__VA_ARGS__); } \
6493 } while(0)
6494 #else
6495 #define VM_PRESSURE_DEBUG(cond, format, ...)
6496 #endif
6497
6498 #define INTER_NOTIFICATION_DELAY (250000) /* .25 second */
6499
6500 void memorystatus_on_pageout_scan_end(void) {
6501 /* No-op */
6502 }
6503
6504 /*
6505 * kn_max - knote
6506 *
6507 * knote_pressure_level - to check if the knote is registered for this notification level.
6508 *
6509 * task - task whose bits we'll be modifying
6510 *
6511 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
6512 *
6513 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
6514 *
6515 */
6516
6517 boolean_t
6518 is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
6519 {
6520 if (kn_max->kn_sfflags & knote_pressure_level) {
6521
6522 if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
6523
6524 task_clear_has_been_notified(task, pressure_level_to_clear);
6525 }
6526
6527 task_mark_has_been_notified(task, pressure_level_to_set);
6528 return TRUE;
6529 }
6530
6531 return FALSE;
6532 }
6533
6534 void
6535 memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
6536 {
6537 struct knote *kn = NULL;
6538
6539 memorystatus_klist_lock();
6540 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
6541
6542 proc_t p = PROC_NULL;
6543 struct task* t = TASK_NULL;
6544
6545 p = knote_get_kq(kn)->kq_p;
6546 proc_list_lock();
6547 if (p != proc_ref_locked(p)) {
6548 p = PROC_NULL;
6549 proc_list_unlock();
6550 continue;
6551 }
6552 proc_list_unlock();
6553
6554 t = (struct task *)(p->task);
6555
6556 task_clear_has_been_notified(t, pressure_level_to_clear);
6557
6558 proc_rele(p);
6559 }
6560
6561 memorystatus_klist_unlock();
6562 }
6563
6564 extern kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process);
6565
6566 struct knote *
6567 vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process);
6568
6569 /*
6570 * Used by the vm_pressure_thread which is
6571 * signalled from within vm_pageout_scan().
6572 */
6573 static void vm_dispatch_memory_pressure(void);
6574 void consider_vm_pressure_events(void);
6575
6576 void consider_vm_pressure_events(void)
6577 {
6578 vm_dispatch_memory_pressure();
6579 }
6580 static void vm_dispatch_memory_pressure(void)
6581 {
6582 memorystatus_update_vm_pressure(FALSE);
6583 }
6584
6585 extern vm_pressure_level_t
6586 convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
6587
6588 struct knote *
6589 vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process)
6590 {
6591 struct knote *kn = NULL, *kn_max = NULL;
6592 uint64_t resident_max = 0; /* MB */
6593 struct timeval curr_tstamp = {0, 0};
6594 int elapsed_msecs = 0;
6595 int selected_task_importance = 0;
6596 static int pressure_snapshot = -1;
6597 boolean_t pressure_increase = FALSE;
6598
6599 if (pressure_snapshot == -1) {
6600 /*
6601 * Initial snapshot.
6602 */
6603 pressure_snapshot = level;
6604 pressure_increase = TRUE;
6605 } else {
6606
6607 if (level && (level >= pressure_snapshot)) {
6608 pressure_increase = TRUE;
6609 } else {
6610 pressure_increase = FALSE;
6611 }
6612
6613 pressure_snapshot = level;
6614 }
6615
6616 if (pressure_increase == TRUE) {
6617 /*
6618 * We'll start by considering the largest
6619 * unimportant task in our list.
6620 */
6621 selected_task_importance = INT_MAX;
6622 } else {
6623 /*
6624 * We'll start by considering the largest
6625 * important task in our list.
6626 */
6627 selected_task_importance = 0;
6628 }
6629
6630 microuptime(&curr_tstamp);
6631
6632 SLIST_FOREACH(kn, candidate_list, kn_selnext) {
6633
6634 uint64_t resident_size = 0; /* MB */
6635 proc_t p = PROC_NULL;
6636 struct task* t = TASK_NULL;
6637 int curr_task_importance = 0;
6638 boolean_t consider_knote = FALSE;
6639 boolean_t privileged_listener = FALSE;
6640
6641 p = knote_get_kq(kn)->kq_p;
6642 proc_list_lock();
6643 if (p != proc_ref_locked(p)) {
6644 p = PROC_NULL;
6645 proc_list_unlock();
6646 continue;
6647 }
6648 proc_list_unlock();
6649
6650 #if CONFIG_MEMORYSTATUS
6651 if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
6652 /*
6653 * Skip process not marked foreground.
6654 */
6655 proc_rele(p);
6656 continue;
6657 }
6658 #endif /* CONFIG_MEMORYSTATUS */
6659
6660 t = (struct task *)(p->task);
6661
6662 timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp);
6663 elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
6664
6665 vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);
6666
6667 if ((kn->kn_sfflags & dispatch_level) == 0) {
6668 proc_rele(p);
6669 continue;
6670 }
6671
6672 #if CONFIG_MEMORYSTATUS
6673 if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
6674 VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid);
6675 proc_rele(p);
6676 continue;
6677 }
6678 #endif /* CONFIG_MEMORYSTATUS */
6679
6680 #if CONFIG_EMBEDDED
6681 curr_task_importance = p->p_memstat_effectivepriority;
6682 #else /* CONFIG_EMBEDDED */
6683 curr_task_importance = task_importance_estimate(t);
6684 #endif /* CONFIG_EMBEDDED */
6685
6686 /*
6687 * Privileged listeners are only considered in the multi-level pressure scheme
6688 * AND only if the pressure is increasing.
6689 */
6690 if (level > 0) {
6691
6692 if (task_has_been_notified(t, level) == FALSE) {
6693
6694 /*
6695 * Is this a privileged listener?
6696 */
6697 if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
6698
6699 if (privileged_listener) {
6700 kn_max = kn;
6701 proc_rele(p);
6702 goto done_scanning;
6703 }
6704 }
6705 } else {
6706 proc_rele(p);
6707 continue;
6708 }
6709 } else if (level == 0) {
6710
6711 /*
6712 * Task wasn't notified when the pressure was increasing and so
6713 * no need to notify it that the pressure is decreasing.
6714 */
6715 if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
6716 proc_rele(p);
6717 continue;
6718 }
6719 }
6720
6721 /*
6722 * We don't want a small process to block large processes from
6723 * being notified again. <rdar://problem/7955532>
6724 */
6725 resident_size = (get_task_phys_footprint(t))/(1024*1024ULL); /* MB */
6726
6727 if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) {
6728
6729 if (level > 0) {
6730 /*
6731 * Warning or Critical Pressure.
6732 */
6733 if (pressure_increase) {
6734 if ((curr_task_importance < selected_task_importance) ||
6735 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
6736
6737 /*
6738 * We have found a candidate process which is:
6739 * a) at a lower importance than the current selected process
6740 * OR
6741 * b) has importance equal to that of the current selected process but is larger
6742 */
6743
6744 consider_knote = TRUE;
6745 }
6746 } else {
6747 if ((curr_task_importance > selected_task_importance) ||
6748 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
6749
6750 /*
6751 * We have found a candidate process which is:
6752 * a) at a higher importance than the current selected process
6753 * OR
6754 * b) has importance equal to that of the current selected process but is larger
6755 */
6756
6757 consider_knote = TRUE;
6758 }
6759 }
6760 } else if (level == 0) {
6761 /*
6762 * Pressure back to normal.
6763 */
6764 if ((curr_task_importance > selected_task_importance) ||
6765 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
6766
6767 consider_knote = TRUE;
6768 }
6769 }
6770
6771 if (consider_knote) {
6772 resident_max = resident_size;
6773 kn_max = kn;
6774 selected_task_importance = curr_task_importance;
6775 consider_knote = FALSE; /* reset for the next candidate */
6776 }
6777 } else {
6778 /* There was no candidate with enough resident memory to scavenge */
6779 VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", p->p_pid, resident_size);
6780 }
6781 proc_rele(p);
6782 }
6783
6784 done_scanning:
6785 if (kn_max) {
6786 VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, knote_get_kq(kn_max)->kq_p->p_pid, resident_max, 0, 0);
6787 VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", knote_get_kq(kn_max)->kq_p->p_pid, resident_max);
6788 }
6789
6790 return kn_max;
6791 }
6792
6793 #define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */
6794 #define WARNING_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
6795 #define CRITICAL_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
6796
6797 uint64_t next_warning_notification_sent_at_ts = 0;
6798 uint64_t next_critical_notification_sent_at_ts = 0;
6799
6800 kern_return_t
6801 memorystatus_update_vm_pressure(boolean_t target_foreground_process)
6802 {
6803 struct knote *kn_max = NULL;
6804 struct knote *kn_cur = NULL, *kn_temp = NULL; /* for safe list traversal */
6805 pid_t target_pid = -1;
6806 struct klist dispatch_klist = { NULL };
6807 proc_t target_proc = PROC_NULL;
6808 struct task *task = NULL;
6809 boolean_t found_candidate = FALSE;
6810
6811 static vm_pressure_level_t level_snapshot = kVMPressureNormal;
6812 static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
6813 boolean_t smoothing_window_started = FALSE;
6814 struct timeval smoothing_window_start_tstamp = {0, 0};
6815 struct timeval curr_tstamp = {0, 0};
6816 int elapsed_msecs = 0;
6817 uint64_t curr_ts = mach_absolute_time();
6818
6819 #if !CONFIG_JETSAM
6820 #define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */
6821
6822 int idle_kill_counter = 0;
6823
6824 /*
6825 * On desktop we take this opportunity to relieve memory pressure
6826 * by immediately killing idle-exitable processes. We use a delay
6827 * to avoid overkill, and we impose a max counter as a fail-safe
6828 * in case daemons re-launch too quickly.
6829 */
6830 while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
6831 if (memorystatus_idle_exit_from_VM() == FALSE) {
6832 /* No idle exitable processes left to kill */
6833 break;
6834 }
6835 idle_kill_counter++;
6836
6837 if (memorystatus_manual_testing_on == TRUE) {
6838 /*
6839 * Skip the delay when testing
6840 * the pressure notification scheme.
6841 */
6842 } else {
6843 delay(1000000); /* 1 second */
6844 }
6845 }
6846 #endif /* !CONFIG_JETSAM */
6847
6848 if (level_snapshot != kVMPressureNormal) {
6849
6850 /*
6851 * Check to see if we are still in the 'resting' period
6852 * after having notified all clients interested in
6853 * a particular pressure level.
6854 */
6855
6856 level_snapshot = memorystatus_vm_pressure_level;
6857
6858 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
6859
6860 if (next_warning_notification_sent_at_ts) {
6861 if (curr_ts < next_warning_notification_sent_at_ts) {
6862 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
6863 return KERN_SUCCESS;
6864 }
6865
6866 next_warning_notification_sent_at_ts = 0;
6867 memorystatus_klist_reset_all_for_level(kVMPressureWarning);
6868 }
6869 } else if (level_snapshot == kVMPressureCritical) {
6870
6871 if (next_critical_notification_sent_at_ts) {
6872 if (curr_ts < next_critical_notification_sent_at_ts) {
6873 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
6874 return KERN_SUCCESS;
6875 }
6876 next_critical_notification_sent_at_ts = 0;
6877 memorystatus_klist_reset_all_for_level(kVMPressureCritical);
6878 }
6879 }
6880 }
6881
6882 while (1) {
6883
6884 /*
6885 * There is a race window here. But it's not clear
6886 * how much we benefit from having extra synchronization.
6887 */
6888 level_snapshot = memorystatus_vm_pressure_level;
6889
6890 if (prev_level_snapshot > level_snapshot) {
6891 /*
6892 * Pressure decreased? Let's take a little breather
6893 * and see if this condition stays.
6894 */
6895 if (smoothing_window_started == FALSE) {
6896
6897 smoothing_window_started = TRUE;
6898 microuptime(&smoothing_window_start_tstamp);
6899 }
6900
6901 microuptime(&curr_tstamp);
6902 timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
6903 elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
6904
6905 if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
6906
6907 delay(INTER_NOTIFICATION_DELAY);
6908 continue;
6909 }
6910 }
6911
6912 prev_level_snapshot = level_snapshot;
6913 smoothing_window_started = FALSE;
6914
6915 memorystatus_klist_lock();
6916 kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);
6917
6918 if (kn_max == NULL) {
6919 memorystatus_klist_unlock();
6920
6921 /*
6922 * No more level-based clients to notify.
6923 *
6924 * Start the 'resting' window within which clients will not be re-notified.
6925 */
6926
6927 if (level_snapshot != kVMPressureNormal) {
6928 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
6929 nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
6930
6931 /* Next warning notification (if nothing changes) won't be sent before...*/
6932 next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
6933 }
6934
6935 if (level_snapshot == kVMPressureCritical) {
6936 nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
6937
6938 /* Next critical notification (if nothing changes) won't be sent before...*/
6939 next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
6940 }
6941 }
6942 return KERN_FAILURE;
6943 }
6944
6945 target_proc = knote_get_kq(kn_max)->kq_p;
6946
6947 proc_list_lock();
6948 if (target_proc != proc_ref_locked(target_proc)) {
6949 target_proc = PROC_NULL;
6950 proc_list_unlock();
6951 memorystatus_klist_unlock();
6952 continue;
6953 }
6954 proc_list_unlock();
6955
6956 target_pid = target_proc->p_pid;
6957
6958 task = (struct task *)(target_proc->task);
6959
6960 if (level_snapshot != kVMPressureNormal) {
6961
6962 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
6963
6964 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
6965 found_candidate = TRUE;
6966 }
6967 } else {
6968 if (level_snapshot == kVMPressureCritical) {
6969
6970 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
6971 found_candidate = TRUE;
6972 }
6973 }
6974 }
6975 } else {
6976 if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
6977
6978 task_clear_has_been_notified(task, kVMPressureWarning);
6979 task_clear_has_been_notified(task, kVMPressureCritical);
6980
6981 found_candidate = TRUE;
6982 }
6983 }
6984
6985 if (found_candidate == FALSE) {
6986 proc_rele(target_proc);
6987 memorystatus_klist_unlock();
6988 continue;
6989 }
6990
6991 SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
6992
6993 int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);
6994
6995 if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
6996 proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
6997 pid_t knote_pid = knote_proc->p_pid;
6998 if (knote_pid == target_pid) {
6999 KNOTE_DETACH(&memorystatus_klist, kn_cur);
7000 KNOTE_ATTACH(&dispatch_klist, kn_cur);
7001 }
7002 }
7003 }
7004
7005 KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
7006
7007 SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
7008 KNOTE_DETACH(&dispatch_klist, kn_cur);
7009 KNOTE_ATTACH(&memorystatus_klist, kn_cur);
7010 }
7011
7012 memorystatus_klist_unlock();
7013
7014 microuptime(&target_proc->vm_pressure_last_notify_tstamp);
7015 proc_rele(target_proc);
7016
7017 if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
7018 break;
7019 }
7020
7021 if (memorystatus_manual_testing_on == TRUE) {
7022 /*
7023 * Testing out the pressure notification scheme.
7024 * No need for delays etc.
7025 */
7026 } else {
7027
7028 uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
7029 #if CONFIG_JETSAM
7030 unsigned int page_delta = 0;
7031 unsigned int skip_delay_page_threshold = 0;
7032
7033 assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
7034
7035 page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
7036 skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
7037
7038 if (memorystatus_available_pages <= skip_delay_page_threshold) {
7039 /*
7040 * We are nearing the critical mark fast and can't afford to wait between
7041 * notifications.
7042 */
7043 sleep_interval = 0;
7044 }
7045 #endif /* CONFIG_JETSAM */
7046
7047 if (sleep_interval) {
7048 delay(sleep_interval);
7049 }
7050 }
7051 }
7052
7053 return KERN_SUCCESS;
7054 }
7055
7056 vm_pressure_level_t
7057 convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
7058 {
7059 vm_pressure_level_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
7060
7061 switch (internal_pressure_level) {
7062
7063 case kVMPressureNormal:
7064 {
7065 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
7066 break;
7067 }
7068
7069 case kVMPressureWarning:
7070 case kVMPressureUrgent:
7071 {
7072 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
7073 break;
7074 }
7075
7076 case kVMPressureCritical:
7077 {
7078 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
7079 break;
7080 }
7081
7082 default:
7083 break;
7084 }
7085
7086 return dispatch_level;
7087 }
7088
7089 static int
7090 sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
7091 {
7092 #pragma unused(arg1, arg2, oidp)
7093 #if CONFIG_EMBEDDED
7094 int error = 0;
7095
7096 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
7097 if (error)
7098 return (error);
7099
7100 #endif /* CONFIG_EMBEDDED */
7101 vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
7102
7103 return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
7104 }
7105
7106 #if DEBUG || DEVELOPMENT
7107
7108 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED,
7109 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
7110
7111 #else /* DEBUG || DEVELOPMENT */
7112
7113 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED,
7114 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
7115
7116 #endif /* DEBUG || DEVELOPMENT */
7117
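/*
 * Illustrative only (never compiled): a minimal user-space sketch of reading
 * the sysctl declared above.  It assumes vm_pressure_level_t is int-sized and
 * that, on embedded configs, the caller holds the PRIV_VM_PRESSURE privilege.
 */
#if 0
#include <stdio.h>
#include <sys/event.h>
#include <sys/sysctl.h>

static int
print_pressure_level(void)
{
	unsigned int dispatch_level = 0;
	size_t len = sizeof(dispatch_level);

	/* "kern.memorystatus_vm_pressure_level" matches the SYSCTL_PROC above */
	if (sysctlbyname("kern.memorystatus_vm_pressure_level", &dispatch_level, &len, NULL, 0) != 0) {
		return -1;
	}

	/* The value is a dispatch-level constant from sys/event.h, not an internal level */
	if (dispatch_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
		printf("pressure: critical\n");
	} else if (dispatch_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
		printf("pressure: warn\n");
	} else {
		printf("pressure: normal\n");
	}
	return 0;
}
#endif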
7118 extern int memorystatus_purge_on_warning;
7119 extern int memorystatus_purge_on_critical;
7120
7121 static int
7122 sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
7123 {
7124 #pragma unused(arg1, arg2)
7125
7126 int level = 0;
7127 int error = 0;
7128 int pressure_level = 0;
7129 int trigger_request = 0;
7130 int force_purge;
7131
7132 error = sysctl_handle_int(oidp, &level, 0, req);
7133 if (error || !req->newptr) {
7134 return (error);
7135 }
7136
7137 memorystatus_manual_testing_on = TRUE;
7138
7139 trigger_request = (level >> 16) & 0xFFFF;
7140 pressure_level = (level & 0xFFFF);
7141
7142 if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
7143 trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
7144 return EINVAL;
7145 }
7146 switch (pressure_level) {
7147 case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
7148 case NOTE_MEMORYSTATUS_PRESSURE_WARN:
7149 case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
7150 break;
7151 default:
7152 return EINVAL;
7153 }
7154
7155 /*
7156 * The pressure level is being set from user-space, which uses the
7157 * constants in sys/event.h, so we translate those events to our
7158 * internal levels here (see the disabled example sketch below).
7159 */
7160 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
7161
7162 memorystatus_manual_testing_level = kVMPressureNormal;
7163 force_purge = 0;
7164
7165 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
7166
7167 memorystatus_manual_testing_level = kVMPressureWarning;
7168 force_purge = memorystatus_purge_on_warning;
7169
7170 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
7171
7172 memorystatus_manual_testing_level = kVMPressureCritical;
7173 force_purge = memorystatus_purge_on_critical;
7174 }
7175
7176 memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
7177
7178 /* purge according to the new pressure level */
7179 switch (trigger_request) {
7180 case TEST_PURGEABLE_TRIGGER_ONE:
7181 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
7182 if (force_purge == 0) {
7183 /* no purging requested */
7184 break;
7185 }
7186 vm_purgeable_object_purge_one_unlocked(force_purge);
7187 break;
7188 case TEST_PURGEABLE_TRIGGER_ALL:
7189 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
7190 if (force_purge == 0) {
7191 /* no purging requested */
7192 break;
7193 }
7194 while (vm_purgeable_object_purge_one_unlocked(force_purge));
7195 break;
7196 }
7197
7198 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
7199 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
7200
7201 memorystatus_update_vm_pressure(TRUE);
7202 }
7203
7204 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
7205 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
7206
7207 while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
7208 continue;
7209 }
7210 }
7211
7212 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
7213 memorystatus_manual_testing_on = FALSE;
7214 }
7215
7216 return 0;
7217 }
7218
7219 SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
7220 0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
7221
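/*
 * Illustrative only (never compiled): a sketch of driving the manual trigger
 * above from user space.  The high 16 bits carry the trigger request and the
 * low 16 bits carry a NOTE_MEMORYSTATUS_PRESSURE_* constant, matching the
 * decoding in sysctl_memorypressure_manual_trigger().  The TEST_* request
 * constants are assumed to come from the private kern_memorystatus header.
 */
#if 0
#include <sys/event.h>
#include <sys/kern_memorystatus.h>
#include <sys/sysctl.h>

static int
trigger_warning_pressure_once(void)
{
	int level = (TEST_LOW_MEMORY_TRIGGER_ONE << 16) | NOTE_MEMORYSTATUS_PRESSURE_WARN;

	/* "kern.memorypressure_manual_trigger" matches the SYSCTL_PROC above */
	return sysctlbyname("kern.memorypressure_manual_trigger", NULL, NULL, &level, sizeof(level));
}
#endif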
7222
7223 extern int memorystatus_purge_on_warning;
7224 extern int memorystatus_purge_on_urgent;
7225 extern int memorystatus_purge_on_critical;
7226
7227 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_warning, 0, "");
7228 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_urgent, 0, "");
7229 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_critical, 0, "");
7230
7231
7232 #endif /* VM_PRESSURE_EVENTS */
7233
7234 /* Return both allocated and actual size, since there's a race between allocation and list compilation */
7235 static int
7236 memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
7237 {
7238 uint32_t list_count, i = 0;
7239 memorystatus_priority_entry_t *list_entry;
7240 proc_t p;
7241
7242 list_count = memorystatus_list_count;
7243 *list_size = sizeof(memorystatus_priority_entry_t) * list_count;
7244
7245 /* Just a size check? */
7246 if (size_only) {
7247 return 0;
7248 }
7249
7250 /* Otherwise, validate the size of the buffer */
7251 if (*buffer_size < *list_size) {
7252 return EINVAL;
7253 }
7254
7255 *list_ptr = (memorystatus_priority_entry_t*)kalloc(*list_size);
7256 if (!*list_ptr) {
7257 return ENOMEM;
7258 }
7259
7260 memset(*list_ptr, 0, *list_size);
7261
7262 *buffer_size = *list_size;
7263 *list_size = 0;
7264
7265 list_entry = *list_ptr;
7266
7267 proc_list_lock();
7268
7269 p = memorystatus_get_first_proc_locked(&i, TRUE);
7270 while (p && (*list_size < *buffer_size)) {
7271 list_entry->pid = p->p_pid;
7272 list_entry->priority = p->p_memstat_effectivepriority;
7273 list_entry->user_data = p->p_memstat_userdata;
7274
7275 if (p->p_memstat_memlimit <= 0) {
7276 task_get_phys_footprint_limit(p->task, &list_entry->limit);
7277 } else {
7278 list_entry->limit = p->p_memstat_memlimit;
7279 }
7280
7281 list_entry->state = memorystatus_build_state(p);
7282 list_entry++;
7283
7284 *list_size += sizeof(memorystatus_priority_entry_t);
7285
7286 p = memorystatus_get_next_proc_locked(&i, p, TRUE);
7287 }
7288
7289 proc_list_unlock();
7290
7291 MEMORYSTATUS_DEBUG(1, "memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);
7292
7293 return 0;
7294 }
7295
7296 static int
7297 memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size) {
7298 int error = 0;
7299 memorystatus_priority_entry_t mp_entry;
7300
7301 /* Validate inputs */
7302 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_entry_t))) {
7303 return EINVAL;
7304 }
7305
7306 proc_t p = proc_find(pid);
7307 if (!p) {
7308 return ESRCH;
7309 }
7310
7311 memset (&mp_entry, 0, sizeof(memorystatus_priority_entry_t));
7312
7313 mp_entry.pid = p->p_pid;
7314 mp_entry.priority = p->p_memstat_effectivepriority;
7315 mp_entry.user_data = p->p_memstat_userdata;
7316 if (p->p_memstat_memlimit <= 0) {
7317 task_get_phys_footprint_limit(p->task, &mp_entry.limit);
7318 } else {
7319 mp_entry.limit = p->p_memstat_memlimit;
7320 }
7321 mp_entry.state = memorystatus_build_state(p);
7322
7323 proc_rele(p);
7324
7325 error = copyout(&mp_entry, buffer, buffer_size);
7326
7327 return (error);
7328 }
7329
7330 static int
7331 memorystatus_cmd_get_priority_list(pid_t pid, user_addr_t buffer, size_t buffer_size, int32_t *retval) {
7332 int error = 0;
7333 boolean_t size_only;
7334 size_t list_size;
7335
7336 /*
7337 * When a non-zero pid is provided, the 'list' has only one entry.
7338 */
7339
7340 size_only = ((buffer == USER_ADDR_NULL) ? TRUE: FALSE);
7341
7342 if (pid != 0) {
7343 list_size = sizeof(memorystatus_priority_entry_t) * 1;
7344 if (!size_only) {
7345 error = memorystatus_get_priority_pid(pid, buffer, buffer_size);
7346 }
7347 } else {
7348 memorystatus_priority_entry_t *list = NULL;
7349 error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);
7350
7351 if (error == 0) {
7352 if (!size_only) {
7353 error = copyout(list, buffer, list_size);
7354 }
7355 }
7356
7357 if (list) {
7358 kfree(list, buffer_size);
7359 }
7360 }
7361
7362 if (error == 0) {
7363 *retval = list_size;
7364 }
7365
7366 return (error);
7367 }
7368
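/*
 * Illustrative only (never compiled): a user-space sketch of the two-call
 * pattern the command above supports -- a NULL buffer returns the required
 * size, and a second call fetches the entries.  Callers need root or the
 * MEMORYSTATUS_ENTITLEMENT (see memorystatus_control() below); the
 * memorystatus_control() prototype and memorystatus_priority_entry_t are
 * assumed to come from the private kern_memorystatus header.
 */
#if 0
#include <stdlib.h>
#include <sys/kern_memorystatus.h>

static memorystatus_priority_entry_t *
fetch_priority_list(size_t *count_out)
{
	/* First call: pid 0 and a NULL buffer ask only for the size */
	int size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, 0, 0, NULL, 0);
	if (size <= 0) {
		return NULL;
	}

	memorystatus_priority_entry_t *list = malloc((size_t)size);
	if (list == NULL) {
		return NULL;
	}

	/* Second call: fill the buffer; the return value is the size actually written */
	size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, 0, 0, list, (size_t)size);
	if (size <= 0) {
		free(list);
		return NULL;
	}

	*count_out = (size_t)size / sizeof(memorystatus_priority_entry_t);
	return list;
}
#endif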
7369 static void
7370 memorystatus_clear_errors(void)
7371 {
7372 proc_t p;
7373 unsigned int i = 0;
7374
7375 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START, 0, 0, 0, 0, 0);
7376
7377 proc_list_lock();
7378
7379 p = memorystatus_get_first_proc_locked(&i, TRUE);
7380 while (p) {
7381 if (p->p_memstat_state & P_MEMSTAT_ERROR) {
7382 p->p_memstat_state &= ~P_MEMSTAT_ERROR;
7383 }
7384 p = memorystatus_get_next_proc_locked(&i, p, TRUE);
7385 }
7386
7387 proc_list_unlock();
7388
7389 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END, 0, 0, 0, 0, 0);
7390 }
7391
7392 #if CONFIG_JETSAM
7393 static void
7394 memorystatus_update_levels_locked(boolean_t critical_only) {
7395
7396 memorystatus_available_pages_critical = memorystatus_available_pages_critical_base;
7397
7398 /*
7399 * If there's an entry in the first bucket, we have idle processes.
7400 */
7401
7402 memstat_bucket_t *first_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
7403 if (first_bucket->count) {
7404 memorystatus_available_pages_critical += memorystatus_available_pages_critical_idle_offset;
7405
7406 if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure ) {
7407 /*
7408 * The critical threshold must never exceed the pressure threshold
7409 */
7410 memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
7411 }
7412 }
7413
7414 #if DEBUG || DEVELOPMENT
7415 if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
7416 memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_diagnostic;
7417
7418 if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure ) {
7419 /*
7420 * The critical threshold must never exceed the pressure threshold
7421 */
7422 memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
7423 }
7424 }
7425 #endif /* DEBUG || DEVELOPMENT */
7426
7427 if (memorystatus_jetsam_policy & kPolicyMoreFree) {
7428 memorystatus_available_pages_critical += memorystatus_policy_more_free_offset_pages;
7429 }
7430
7431 if (critical_only) {
7432 return;
7433 }
7434
7435 #if VM_PRESSURE_EVENTS
7436 memorystatus_available_pages_pressure = (pressure_threshold_percentage / delta_percentage) * memorystatus_delta;
7437 #if DEBUG || DEVELOPMENT
7438 if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
7439 memorystatus_available_pages_pressure += memorystatus_jetsam_policy_offset_pages_diagnostic;
7440 }
7441 #endif
7442 #endif
7443 }
7444
7445
7446 static int
7447 sysctl_kern_memorystatus_policy_more_free SYSCTL_HANDLER_ARGS
7448 {
7449 #pragma unused(arg1, arg2, oidp)
7450 int error = 0, more_free = 0;
7451
7452 /*
7453 * TODO: Enable this privilege check?
7454 *
7455 * error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
7456 * if (error)
7457 * return (error);
7458 */
7459
7460 error = sysctl_handle_int(oidp, &more_free, 0, req);
7461 if (error || !req->newptr)
7462 return (error);
7463
7464 if ((more_free && ((memorystatus_jetsam_policy & kPolicyMoreFree) == kPolicyMoreFree)) ||
7465 (!more_free && ((memorystatus_jetsam_policy & kPolicyMoreFree) == 0))) {
7466
7467 /*
7468 * No change in state.
7469 */
7470 return 0;
7471 }
7472
7473 proc_list_lock();
7474
7475 if (more_free) {
7476 memorystatus_jetsam_policy |= kPolicyMoreFree;
7477 } else {
7478 memorystatus_jetsam_policy &= ~kPolicyMoreFree;
7479 }
7480
7481 memorystatus_update_levels_locked(TRUE);
7482
7483 proc_list_unlock();
7484
7485 return 0;
7486 }
7487 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_policy_more_free, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
7488 0, 0, &sysctl_kern_memorystatus_policy_more_free, "I", "");
7489
7490 #endif /* CONFIG_JETSAM */
7491
7492 /*
7493 * Get the at_boot snapshot
7494 */
7495 static int
7496 memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
7497 size_t input_size = *snapshot_size;
7498
7499 /*
7500 * The at_boot snapshot has no entry list.
7501 */
7502 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t);
7503
7504 if (size_only) {
7505 return 0;
7506 }
7507
7508 /*
7509 * Validate the size of the snapshot buffer
7510 */
7511 if (input_size < *snapshot_size) {
7512 return EINVAL;
7513 }
7514
7515 /*
7516 * Update the notification_time only
7517 */
7518 memorystatus_at_boot_snapshot.notification_time = mach_absolute_time();
7519 *snapshot = &memorystatus_at_boot_snapshot;
7520
7521 MEMORYSTATUS_DEBUG(7, "memorystatus_get_at_boot_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%d)\n",
7522 (long)input_size, (long)*snapshot_size, 0);
7523 return 0;
7524 }
7525
7526 static int
7527 memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
7528 size_t input_size = *snapshot_size;
7529 uint32_t ods_list_count = memorystatus_list_count;
7530 memorystatus_jetsam_snapshot_t *ods = NULL; /* The on_demand snapshot buffer */
7531
7532 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (ods_list_count));
7533
7534 if (size_only) {
7535 return 0;
7536 }
7537
7538 /*
7539 * Validate the size of the snapshot buffer.
7540 * This is inherently racy. May want to revisit
7541 * this error condition and trim the output when
7542 * it doesn't fit.
7543 */
7544 if (input_size < *snapshot_size) {
7545 return EINVAL;
7546 }
7547
7548 /*
7549 * Allocate and initialize a snapshot buffer.
7550 */
7551 ods = (memorystatus_jetsam_snapshot_t *)kalloc(*snapshot_size);
7552 if (!ods) {
7553 return (ENOMEM);
7554 }
7555
7556 memset(ods, 0, *snapshot_size);
7557
7558 proc_list_lock();
7559 memorystatus_init_jetsam_snapshot_locked(ods, ods_list_count);
7560 proc_list_unlock();
7561
7562 /*
7563 * Return the kernel allocated, on_demand buffer.
7564 * The caller of this routine will copy the data out
7565 * to user space and then free the kernel allocated
7566 * buffer.
7567 */
7568 *snapshot = ods;
7569
7570 MEMORYSTATUS_DEBUG(7, "memorystatus_get_on_demand_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
7571 (long)input_size, (long)*snapshot_size, (long)ods_list_count);
7572
7573 return 0;
7574 }
7575
7576 static int
7577 memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
7578 size_t input_size = *snapshot_size;
7579
7580 if (memorystatus_jetsam_snapshot_count > 0) {
7581 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
7582 } else {
7583 *snapshot_size = 0;
7584 }
7585
7586 if (size_only) {
7587 return 0;
7588 }
7589
7590 if (input_size < *snapshot_size) {
7591 return EINVAL;
7592 }
7593
7594 *snapshot = memorystatus_jetsam_snapshot;
7595
7596 MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
7597 (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_count);
7598
7599 return 0;
7600 }
7601
7602
7603 static int
7604 memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval) {
7605 int error = EINVAL;
7606 boolean_t size_only;
7607 boolean_t is_default_snapshot = FALSE;
7608 boolean_t is_on_demand_snapshot = FALSE;
7609 boolean_t is_at_boot_snapshot = FALSE;
7610 memorystatus_jetsam_snapshot_t *snapshot;
7611
7612 size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);
7613
7614 if (flags == 0) {
7615 /* Default */
7616 is_default_snapshot = TRUE;
7617 error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
7618 } else {
7619 if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) {
7620 /*
7621 * Unsupported bit set in flag.
7622 */
7623 return EINVAL;
7624 }
7625
7626 if ((flags & (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) ==
7627 (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) {
7628 /*
7629 * Can't have both set at the same time.
7630 */
7631 return EINVAL;
7632 }
7633
7634 if (flags & MEMORYSTATUS_SNAPSHOT_ON_DEMAND) {
7635 is_on_demand_snapshot = TRUE;
7636 /*
7637 * When not requesting the size only, the following call will allocate
7638 * an on_demand snapshot buffer, which is freed below.
7639 */
7640 error = memorystatus_get_on_demand_snapshot(&snapshot, &buffer_size, size_only);
7641
7642 } else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
7643 is_at_boot_snapshot = TRUE;
7644 error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
7645 } else {
7646 /*
7647 * Invalid flag setting.
7648 */
7649 return EINVAL;
7650 }
7651 }
7652
7653 if (error) {
7654 goto out;
7655 }
7656
7657 /*
7658 * Copy the data out to user space and clear the snapshot buffer.
7659 * If working with the jetsam snapshot,
7660 * clearing the buffer means resetting the count.
7661 * If working with an on_demand snapshot,
7662 * clearing the buffer means freeing it.
7663 * If working with the at_boot snapshot,
7664 * there is nothing to clear or update.
7665 */
7666 if (!size_only) {
7667 if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
7668 if (is_default_snapshot) {
7669 /*
7670 * The jetsam snapshot is never freed, its count is simply reset.
7671 */
7672 proc_list_lock();
7673 snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
7674 memorystatus_jetsam_snapshot_last_timestamp = 0;
7675 proc_list_unlock();
7676 }
7677 }
7678
7679 if (is_on_demand_snapshot) {
7680 /*
7681 * The on_demand snapshot is always freed,
7682 * even if the copyout failed.
7683 */
7684 if (snapshot) {
7685 kfree(snapshot, buffer_size);
7686 }
7687 }
7688 }
7689
7690 if (error == 0) {
7691 *retval = buffer_size;
7692 }
7693 out:
7694 return error;
7695 }
7696
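/*
 * Illustrative only (never compiled): a user-space sketch of pulling an
 * on_demand snapshot with the command above, using the same size-query /
 * fetch pattern as the priority list.  The MEMORYSTATUS_SNAPSHOT_ON_DEMAND
 * flag, the snapshot types and the memorystatus_control() prototype are
 * assumed to come from the private kern_memorystatus header.
 */
#if 0
#include <stdlib.h>
#include <sys/kern_memorystatus.h>

static memorystatus_jetsam_snapshot_t *
fetch_on_demand_snapshot(void)
{
	int size = memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0,
	    MEMORYSTATUS_SNAPSHOT_ON_DEMAND, NULL, 0);
	if (size <= 0) {
		return NULL;
	}

	memorystatus_jetsam_snapshot_t *snap = malloc((size_t)size);
	if (snap == NULL) {
		return NULL;
	}

	/* The list may have grown since the size query; EINVAL is possible here */
	if (memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0,
	    MEMORYSTATUS_SNAPSHOT_ON_DEMAND, snap, (size_t)size) <= 0) {
		free(snap);
		return NULL;
	}
	return snap;	/* snap->entry_count entries follow the header */
}
#endif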
7697 /*
7698 * Routine: memorystatus_cmd_grp_set_properties
7699 * Purpose: Update properties for a group of processes.
7700 *
7701 * Supported Properties:
7702 * [priority]
7703 * Move each process out of its effective priority
7704 * band and into a new priority band.
7705 * Maintains relative order from lowest to highest priority.
7706 * In a single band, maintains relative order from head to tail.
7707 *
7708 * eg: before [effectivepriority | pid]
7709 * [18 | p101 ]
7710 * [17 | p55, p67, p19 ]
7711 * [12 | p103 p10 ]
7712 * [ 7 | p25 ]
7713 * [ 0 | p71, p82, ]
7714 *
7715 * after [ new band | pid]
7716 * [ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
7717 *
7718 * Returns: 0 on success, else non-zero.
7719 *
7720 * Caveat: We know there is a race window regarding recycled pids.
7721 * A process could be killed before the kernel can act on it here.
7722 * If a pid cannot be found in any of the jetsam priority bands,
7723 * then we simply ignore it. No harm.
7724 * But, if the pid has been recycled then it could be an issue.
7725 * In that scenario, we might move an unsuspecting process to the new
7726 * priority band. It's not clear how the kernel can safeguard
7727 * against this, but it would be an extremely rare case anyway.
7728 * The caller of this API might avoid such race conditions by
7729 * ensuring that the processes passed in the pid list are suspended.
7730 */
7731
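/*
 * Illustrative only (never compiled): a user-space sketch of batching a
 * priority move with MEMORYSTATUS_CMD_GRP_SET_PROPERTIES.  The buffer is an
 * array of memorystatus_priority_entry_t whose size must be a multiple of
 * the entry size; the pid argument is ignored for this command.  The
 * memorystatus_control() prototype and types are assumed to come from the
 * private kern_memorystatus header.
 */
#if 0
#include <string.h>
#include <sys/kern_memorystatus.h>
#include <sys/types.h>

static int
move_pids_to_band(const pid_t *pids, int npids, int32_t band)
{
	memorystatus_priority_entry_t entries[npids];

	memset(entries, 0, sizeof(entries));
	for (int i = 0; i < npids; i++) {
		entries[i].pid = pids[i];
		entries[i].priority = band;	/* -1 is shorthand for JETSAM_PRIORITY_DEFAULT */
	}

	return memorystatus_control(MEMORYSTATUS_CMD_GRP_SET_PROPERTIES, 0, 0,
	    entries, sizeof(memorystatus_priority_entry_t) * (size_t)npids);
}
#endif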
7732
7733 /* This internal structure can expand when we add support for more properties */
7734 typedef struct memorystatus_internal_properties
7735 {
7736 proc_t proc;
7737 int32_t priority; /* see memorystatus_priority_entry_t : priority */
7738 } memorystatus_internal_properties_t;
7739
7740
7741 static int
7742 memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
7743
7744 #pragma unused (flags)
7745
7746 /*
7747 * We only handle setting priority
7748 * per process
7749 */
7750
7751 int error = 0;
7752 memorystatus_priority_entry_t *entries = NULL;
7753 uint32_t entry_count = 0;
7754
7755 /* This will be the ordered proc list */
7756 memorystatus_internal_properties_t *table = NULL;
7757 size_t table_size = 0;
7758 uint32_t table_count = 0;
7759
7760 uint32_t i = 0;
7761 uint32_t bucket_index = 0;
7762 boolean_t head_insert;
7763 int32_t new_priority;
7764
7765 proc_t p;
7766
7767 /* Verify inputs */
7768 if ((buffer == USER_ADDR_NULL) || (buffer_size == 0) || ((buffer_size % sizeof(memorystatus_priority_entry_t)) != 0)) {
7769 error = EINVAL;
7770 goto out;
7771 }
7772
7773 entry_count = (buffer_size / sizeof(memorystatus_priority_entry_t));
7774 if ((entries = (memorystatus_priority_entry_t *)kalloc(buffer_size)) == NULL) {
7775 error = ENOMEM;
7776 goto out;
7777 }
7778
7779 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, entry_count, 0, 0, 0, 0);
7780
7781 if ((error = copyin(buffer, entries, buffer_size)) != 0) {
7782 goto out;
7783 }
7784
7785 /* Verify sanity of input priorities */
7786 for (i=0; i < entry_count; i++) {
7787 if (entries[i].priority == -1) {
7788 /* Use as shorthand for default priority */
7789 entries[i].priority = JETSAM_PRIORITY_DEFAULT;
7790 } else if ((entries[i].priority == system_procs_aging_band) || (entries[i].priority == applications_aging_band)) {
7791 /* Both the aging bands are reserved for internal use;
7792 * if requested, adjust to JETSAM_PRIORITY_IDLE. */
7793 entries[i].priority = JETSAM_PRIORITY_IDLE;
7794 } else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
7795 /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
7796 * queue */
7797 /* Deal with this later */
7798 } else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
7799 /* Sanity check */
7800 error = EINVAL;
7801 goto out;
7802 }
7803 }
7804
7805 table_size = sizeof(memorystatus_internal_properties_t) * entry_count;
7806 if ( (table = (memorystatus_internal_properties_t *)kalloc(table_size)) == NULL) {
7807 error = ENOMEM;
7808 goto out;
7809 }
7810 memset(table, 0, table_size);
7811
7812
7813 /*
7814 * For each jetsam bucket entry, spin through the input property list.
7815 * When a matching pid is found, populate an adjacent table with the
7816 * appropriate proc pointer and new property values.
7817 * This traversal automatically preserves order from lowest
7818 * to highest priority.
7819 */
7820
7821 bucket_index=0;
7822
7823 proc_list_lock();
7824
7825 /* Create the ordered table */
7826 p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
7827 while (p && (table_count < entry_count)) {
7828 for (i=0; i < entry_count; i++ ) {
7829 if (p->p_pid == entries[i].pid) {
7830 /* Build the table data */
7831 table[table_count].proc = p;
7832 table[table_count].priority = entries[i].priority;
7833 table_count++;
7834 break;
7835 }
7836 }
7837 p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
7838 }
7839
7840 /* We now have ordered list of procs ready to move */
7841 for (i=0; i < table_count; i++) {
7842 p = table[i].proc;
7843 assert(p != NULL);
7844
7845 /* Allow head inserts -- but relative order is no longer preserved */
7846 if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
7847 new_priority = JETSAM_PRIORITY_IDLE;
7848 head_insert = true;
7849 } else {
7850 new_priority = table[i].priority;
7851 head_insert = false;
7852 }
7853
7854 /* Not allowed */
7855 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
7856 continue;
7857 }
7858
7859 /*
7860 * Take appropriate steps if moving proc out of
7861 * either of the aging bands.
7862 */
7863 if ((p->p_memstat_effectivepriority == system_procs_aging_band) || (p->p_memstat_effectivepriority == applications_aging_band)) {
7864 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
7865 }
7866
7867 memorystatus_update_priority_locked(p, new_priority, head_insert, false);
7868 }
7869
7870 proc_list_unlock();
7871
7872 /*
7873 * if (table_count != entry_count)
7874 * then some pids were not found in a jetsam band.
7875 * harmless but interesting...
7876 */
7877 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, entry_count, table_count, 0, 0, 0);
7878
7879 out:
7880 if (entries)
7881 kfree(entries, buffer_size);
7882 if (table)
7883 kfree(table, table_size);
7884
7885 return (error);
7886 }
7887
7888
7889 /*
7890 * This routine is used to update a process's jetsam priority position and stored user_data.
7891 * It is not used for the setting of memory limits, which is why the last 6 args to the
7892 * memorystatus_update() call are 0 or FALSE.
7893 */
7894
7895 static int
7896 memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
7897 int error = 0;
7898 memorystatus_priority_properties_t mpp_entry;
7899
7900 /* Validate inputs */
7901 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_properties_t))) {
7902 return EINVAL;
7903 }
7904
7905 error = copyin(buffer, &mpp_entry, buffer_size);
7906
7907 if (error == 0) {
7908 proc_t p;
7909
7910 p = proc_find(pid);
7911 if (!p) {
7912 return ESRCH;
7913 }
7914
7915 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
7916 proc_rele(p);
7917 return EPERM;
7918 }
7919
7920 error = memorystatus_update(p, mpp_entry.priority, mpp_entry.user_data, FALSE, FALSE, 0, 0, FALSE, FALSE);
7921 proc_rele(p);
7922 }
7923
7924 return(error);
7925 }
7926
7927 static int
7928 memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
7929 int error = 0;
7930 memorystatus_memlimit_properties_t mmp_entry;
7931
7932 /* Validate inputs */
7933 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
7934 return EINVAL;
7935 }
7936
7937 error = copyin(buffer, &mmp_entry, buffer_size);
7938
7939 if (error == 0) {
7940 error = memorystatus_set_memlimit_properties(pid, &mmp_entry);
7941 }
7942
7943 return(error);
7944 }
7945
7946 /*
7947 * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
7948 * That gets the proc's cached memlimit and there is no guarantee that the active/inactive
7949 * limits will be the same in the no-limit case. Instead we convert limits <= 0 using
7950 * task_convert_phys_footprint_limit(). It computes the same limit value that would be written
7951 * to the task's ledgers via task_set_phys_footprint_limit().
7952 */
7953 static int
7954 memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
7955 int error = 0;
7956 memorystatus_memlimit_properties_t mmp_entry;
7957
7958 /* Validate inputs */
7959 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
7960 return EINVAL;
7961 }
7962
7963 memset (&mmp_entry, 0, sizeof(memorystatus_memlimit_properties_t));
7964
7965 proc_t p = proc_find(pid);
7966 if (!p) {
7967 return ESRCH;
7968 }
7969
7970 /*
7971 * Get the active limit and attributes.
7972 * No locks taken since we hold a reference to the proc.
7973 */
7974
7975 if (p->p_memstat_memlimit_active > 0 ) {
7976 mmp_entry.memlimit_active = p->p_memstat_memlimit_active;
7977 } else {
7978 task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_active);
7979 }
7980
7981 if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {
7982 mmp_entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7983 }
7984
7985 /*
7986 * Get the inactive limit and attributes
7987 */
7988 if (p->p_memstat_memlimit_inactive <= 0) {
7989 task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_inactive);
7990 } else {
7991 mmp_entry.memlimit_inactive = p->p_memstat_memlimit_inactive;
7992 }
7993 if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {
7994 mmp_entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7995 }
7996 proc_rele(p);
7997
7998 error = copyout(&mmp_entry, buffer, buffer_size);
7999
8000 return(error);
8001 }
8002
8003
8004 /*
8005 * SPI for kbd - pr24956468
8006 * This is a very simple snapshot that calculates how much a
8007 * process's phys_footprint exceeds a specific memory limit.
8008 * Only the inactive memory limit is supported for now.
8009 * The delta is returned as bytes in excess or zero.
8010 */
8011 static int
8012 memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
8013 int error = 0;
8014 uint64_t footprint_in_bytes = 0;
8015 uint64_t delta_in_bytes = 0;
8016 int32_t memlimit_mb = 0;
8017 uint64_t memlimit_bytes = 0;
8018
8019 /* Validate inputs */
8020 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(uint64_t)) || (flags != 0)) {
8021 return EINVAL;
8022 }
8023
8024 proc_t p = proc_find(pid);
8025 if (!p) {
8026 return ESRCH;
8027 }
8028
8029 /*
8030 * Get the inactive limit.
8031 * No locks taken since we hold a reference to the proc.
8032 */
8033
8034 if (p->p_memstat_memlimit_inactive <= 0) {
8035 task_convert_phys_footprint_limit(-1, &memlimit_mb);
8036 } else {
8037 memlimit_mb = p->p_memstat_memlimit_inactive;
8038 }
8039
8040 footprint_in_bytes = get_task_phys_footprint(p->task);
8041
8042 proc_rele(p);
8043
8044 memlimit_bytes = (uint64_t)memlimit_mb * 1024 * 1024; /* MB to bytes; widen to 64-bit before multiplying to avoid overflow */
8045
8046 /*
8047 * Computed delta always returns >= 0 bytes
8048 */
8049 if (footprint_in_bytes > memlimit_bytes) {
8050 delta_in_bytes = footprint_in_bytes - memlimit_bytes;
8051 }
8052
8053 error = copyout(&delta_in_bytes, buffer, sizeof(delta_in_bytes));
8054
8055 return(error);
8056 }
8057
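/*
 * Illustrative only (never compiled): a user-space sketch of the excess query
 * above.  The buffer must be exactly sizeof(uint64_t) and flags must be 0;
 * the memorystatus_control() prototype is assumed to come from the private
 * kern_memorystatus header.
 */
#if 0
#include <stdint.h>
#include <sys/kern_memorystatus.h>

static uint64_t
bytes_over_inactive_limit(pid_t pid)
{
	uint64_t excess = 0;

	if (memorystatus_control(MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS, pid, 0,
	    &excess, sizeof(excess)) != 0) {
		return 0;
	}
	return excess;	/* 0 when the footprint is at or below the inactive limit */
}
#endif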
8058
8059 static int
8060 memorystatus_cmd_get_pressure_status(int32_t *retval) {
8061 int error;
8062
8063 /* Need privilege for check */
8064 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
8065 if (error) {
8066 return (error);
8067 }
8068
8069 /* Inherently racy, so it's not worth taking a lock here */
8070 *retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
8071
8072 return error;
8073 }
8074
8075 int
8076 memorystatus_get_pressure_status_kdp() {
8077 return (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
8078 }
8079
8080 /*
8081 * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
8082 *
8083 * This call is inflexible -- it does not distinguish between active/inactive or fatal/non-fatal limits.
8084 * So, with 2-level HWM, preserving previous behavior maps as follows.
8085 * - treat the limit passed in as both an active and inactive limit.
8086 * - treat the is_fatal_limit flag as though it applies to both active and inactive limits.
8087 *
8088 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK
8089 * - the is_fatal_limit is FALSE, meaning the active and inactive limits are non-fatal/soft
8090 * - so mapping is (active/non-fatal, inactive/non-fatal)
8091 *
8092 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
8093 * - the is_fatal_limit is TRUE, meaning the process's active and inactive limits are fatal/hard
8094 * - so mapping is (active/fatal, inactive/fatal)
8095 */
8096
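/*
 * Illustrative only (never compiled, CONFIG_JETSAM configs only): the mapping
 * above from the caller's side.  Both legacy commands carry the limit, in MB,
 * in the flags argument of memorystatus_control(); the prototype is assumed
 * to come from the private kern_memorystatus header.
 */
#if 0
#include <sys/kern_memorystatus.h>

static int
set_fatal_task_limit(pid_t pid, uint32_t limit_mb)
{
	/* Maps to (active/fatal, inactive/fatal) per the comment above */
	return memorystatus_control(MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT, pid, limit_mb, NULL, 0);
}
#endif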
8097 #if CONFIG_JETSAM
8098 static int
8099 memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit) {
8100 int error = 0;
8101 memorystatus_memlimit_properties_t entry;
8102
8103 entry.memlimit_active = high_water_mark;
8104 entry.memlimit_active_attr = 0;
8105 entry.memlimit_inactive = high_water_mark;
8106 entry.memlimit_inactive_attr = 0;
8107
8108 if (is_fatal_limit == TRUE) {
8109 entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8110 entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8111 }
8112
8113 error = memorystatus_set_memlimit_properties(pid, &entry);
8114 return (error);
8115 }
8116 #endif /* CONFIG_JETSAM */
8117
8118 static int
8119 memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry) {
8120
8121 int32_t memlimit_active;
8122 boolean_t memlimit_active_is_fatal;
8123 int32_t memlimit_inactive;
8124 boolean_t memlimit_inactive_is_fatal;
8125 uint32_t valid_attrs = 0;
8126 int error = 0;
8127
8128 proc_t p = proc_find(pid);
8129 if (!p) {
8130 return ESRCH;
8131 }
8132
8133 /*
8134 * Check for valid attribute flags.
8135 */
8136 valid_attrs |= (MEMORYSTATUS_MEMLIMIT_ATTR_FATAL);
8137 if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
8138 proc_rele(p);
8139 return EINVAL;
8140 }
8141 if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
8142 proc_rele(p);
8143 return EINVAL;
8144 }
8145
8146 /*
8147 * Setup the active memlimit properties
8148 */
8149 memlimit_active = entry->memlimit_active;
8150 if (entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
8151 memlimit_active_is_fatal = TRUE;
8152 } else {
8153 memlimit_active_is_fatal = FALSE;
8154 }
8155
8156 /*
8157 * Setup the inactive memlimit properties
8158 */
8159 memlimit_inactive = entry->memlimit_inactive;
8160 if (entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
8161 memlimit_inactive_is_fatal = TRUE;
8162 } else {
8163 memlimit_inactive_is_fatal = FALSE;
8164 }
8165
8166 /*
8167 * Setting a limit of <= 0 implies that the process has no
8168 * high-water-mark and has no per-task-limit. That means
8169 * the system_wide task limit is in place, which, by the way,
8170 * is always fatal.
8171 */
8172
8173 if (memlimit_active <= 0) {
8174 /*
8175 * Enforce the fatal system_wide task limit while process is active.
8176 */
8177 memlimit_active = -1;
8178 memlimit_active_is_fatal = TRUE;
8179 }
8180
8181 if (memlimit_inactive <= 0) {
8182 /*
8183 * Enforce the fatal system_wide task limit while process is inactive.
8184 */
8185 memlimit_inactive = -1;
8186 memlimit_inactive_is_fatal = TRUE;
8187 }
8188
8189 proc_list_lock();
8190
8191 /*
8192 * Store the active limit variants in the proc.
8193 */
8194 SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);
8195
8196 /*
8197 * Store the inactive limit variants in the proc.
8198 */
8199 SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);
8200
8201 /*
8202 * Enforce appropriate limit variant by updating the cached values
8203 * and writing the ledger.
8204 * Limit choice is based on process active/inactive state.
8205 */
8206
8207 if (memorystatus_highwater_enabled) {
8208 boolean_t is_fatal;
8209 boolean_t use_active;
8210
8211 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
8212 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
8213 use_active = TRUE;
8214 } else {
8215 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
8216 use_active = FALSE;
8217 }
8218
8219 /* Enforce the limit by writing to the ledgers */
8220 error = (task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal) == 0) ? 0 : EINVAL;
8221
8222 MEMORYSTATUS_DEBUG(3, "memorystatus_set_memlimit_properties: new limit on pid %d (%dMB %s) current priority (%d) dirty_state?=0x%x %s\n",
8223 p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
8224 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
8225 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
8226 DTRACE_MEMORYSTATUS2(memorystatus_set_memlimit, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
8227 }
8228
8229 proc_list_unlock();
8230 proc_rele(p);
8231
8232 return error;
8233 }
8234
8235 /*
8236 * Returns the jetsam priority (effective or requested) of the process
8237 * associated with this task.
8238 */
8239 int
8240 proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
8241 {
8242 if (p) {
8243 if (effective_priority) {
8244 return p->p_memstat_effectivepriority;
8245 } else {
8246 return p->p_memstat_requestedpriority;
8247 }
8248 }
8249 return 0;
8250 }
8251
8252 int
8253 memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret) {
8254 int error = EINVAL;
8255 os_reason_t jetsam_reason = OS_REASON_NULL;
8256
8257 #if !CONFIG_JETSAM
8258 #pragma unused(ret)
8259 #pragma unused(jetsam_reason)
8260 #endif
8261
8262 /* Need to be root or have entitlement */
8263 if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
8264 error = EPERM;
8265 goto out;
8266 }
8267
8268 /*
8269 * Sanity check.
8270 * Do not enforce it for snapshots.
8271 */
8272 if (args->command != MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT) {
8273 if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
8274 error = EINVAL;
8275 goto out;
8276 }
8277 }
8278
8279 switch (args->command) {
8280 case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
8281 error = memorystatus_cmd_get_priority_list(args->pid, args->buffer, args->buffersize, ret);
8282 break;
8283 case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
8284 error = memorystatus_cmd_set_priority_properties(args->pid, args->buffer, args->buffersize, ret);
8285 break;
8286 case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
8287 error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
8288 break;
8289 case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES:
8290 error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
8291 break;
8292 case MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS:
8293 error = memorystatus_cmd_get_memlimit_excess_np(args->pid, args->flags, args->buffer, args->buffersize, ret);
8294 break;
8295 case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
8296 error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
8297 break;
8298 case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
8299 error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
8300 break;
8301 case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
8302 error = memorystatus_cmd_get_pressure_status(ret);
8303 break;
8304 #if CONFIG_JETSAM
8305 case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
8306 /*
8307 * This call does not distinguish between active and inactive limits.
8308 * Default behavior in 2-level HWM world is to set both.
8309 * Non-fatal limit is also assumed for both.
8310 */
8311 error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
8312 break;
8313 case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
8314 /*
8315 * This call does not distinguish between active and inactive limits.
8316 * Default behavior in 2-level HWM world is to set both.
8317 * Fatal limit is also assumed for both.
8318 */
8319 error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
8320 break;
8321 #endif /* CONFIG_JETSAM */
8322 /* Test commands */
8323 #if DEVELOPMENT || DEBUG
8324 case MEMORYSTATUS_CMD_TEST_JETSAM:
8325 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_GENERIC);
8326 if (jetsam_reason == OS_REASON_NULL) {
8327 printf("memorystatus_control: failed to allocate jetsam reason\n");
8328 }
8329
8330 error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 0 : EINVAL;
8331 break;
8332 case MEMORYSTATUS_CMD_TEST_JETSAM_SORT:
8333 error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags);
8334 break;
8335 #if CONFIG_JETSAM
8336 case MEMORYSTATUS_CMD_SET_JETSAM_PANIC_BITS:
8337 error = memorystatus_cmd_set_panic_bits(args->buffer, args->buffersize);
8338 break;
8339 #endif /* CONFIG_JETSAM */
8340 #else /* DEVELOPMENT || DEBUG */
8341 #pragma unused(jetsam_reason)
8342 #endif /* DEVELOPMENT || DEBUG */
8343 case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE:
8344 if (memorystatus_aggressive_jetsam_lenient_allowed == FALSE) {
8345 #if DEVELOPMENT || DEBUG
8346 printf("Enabling Lenient Mode\n");
8347 #endif /* DEVELOPMENT || DEBUG */
8348
8349 memorystatus_aggressive_jetsam_lenient_allowed = TRUE;
8350 memorystatus_aggressive_jetsam_lenient = TRUE;
8351 error = 0;
8352 }
8353 break;
8354 case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE:
8355 #if DEVELOPMENT || DEBUG
8356 printf("Disabling Lenient mode\n");
8357 #endif /* DEVELOPMENT || DEBUG */
8358 memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
8359 memorystatus_aggressive_jetsam_lenient = FALSE;
8360 error = 0;
8361 break;
8362 case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
8363 case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
8364 error = memorystatus_low_mem_privileged_listener(args->command);
8365 break;
8366
8367 case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE:
8368 case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE:
8369 error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, args->flags ? TRUE : FALSE);
8370 break;
8371
8372 default:
8373 break;
8374 }
8375
8376 out:
8377 return error;
8378 }
8379
8380
8381 static int
8382 filt_memorystatusattach(struct knote *kn, __unused struct kevent_internal_s *kev)
8383 {
8384 int error;
8385
8386 kn->kn_flags |= EV_CLEAR;
8387 error = memorystatus_knote_register(kn);
8388 if (error) {
8389 kn->kn_flags = EV_ERROR;
8390 kn->kn_data = error;
8391 }
8392 return 0;
8393 }
8394
8395 static void
8396 filt_memorystatusdetach(struct knote *kn)
8397 {
8398 memorystatus_knote_unregister(kn);
8399 }
8400
8401 static int
8402 filt_memorystatus(struct knote *kn __unused, long hint)
8403 {
8404 if (hint) {
8405 switch (hint) {
8406 case kMemorystatusNoPressure:
8407 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
8408 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
8409 }
8410 break;
8411 case kMemorystatusPressure:
8412 if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
8413 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
8414 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
8415 }
8416 } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
8417
8418 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
8419 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
8420 }
8421 }
8422 break;
8423 case kMemorystatusLowSwap:
8424 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
8425 kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
8426 }
8427 break;
8428
8429 case kMemorystatusProcLimitWarn:
8430 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
8431 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
8432 }
8433 break;
8434
8435 case kMemorystatusProcLimitCritical:
8436 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
8437 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
8438 }
8439 break;
8440
8441 default:
8442 break;
8443 }
8444 }
8445
8446 #if 0
8447 if (kn->kn_fflags != 0) {
8448 proc_t knote_proc = knote_get_kq(kn)->kq_p;
8449 pid_t knote_pid = knote_proc->p_pid;
8450
8451 printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
8452 (unsigned long)kn, kn->kn_fflags, knote_pid);
8453 }
8454 #endif
8455
8456 return (kn->kn_fflags != 0);
8457 }
8458
8459 static int
8460 filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev)
8461 {
8462 int res;
8463 int prev_kn_sfflags = 0;
8464
8465 memorystatus_klist_lock();
8466
8467 /*
8468 * copy in new kevent settings
8469 * (saving the "desired" data and fflags).
8470 */
8471
8472 prev_kn_sfflags = kn->kn_sfflags;
8473 kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
8474
8475 #if !CONFIG_EMBEDDED
8476 /*
8477 * Only on desktop do we restrict notifications to
8478 * one per active/inactive state (soft limits only).
8479 */
8480 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
8481 /*
8482 * Is there previous state to preserve?
8483 */
8484 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
8485 /*
8486 * This knote was previously interested in proc_limit_warn,
8487 * so yes, preserve previous state.
8488 */
8489 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
8490 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
8491 }
8492 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
8493 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
8494 }
8495 } else {
8496 /*
8497 * This knote was not previously interested in proc_limit_warn,
8498 * but it is now. Set both states.
8499 */
8500 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
8501 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
8502 }
8503 }
8504
8505 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
8506 /*
8507 * Is there previous state to preserve?
8508 */
8509 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
8510 /*
8511 * This knote was previously interested in proc_limit_critical,
8512 * so yes, preserve previous state.
8513 */
8514 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
8515 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
8516 }
8517 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
8518 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
8519 }
8520 } else {
8521 /*
8522 * This knote was not previously interested in proc_limit_critical,
8523 * but it is now. Set both states.
8524 */
8525 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
8526 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
8527 }
8528 }
8529 #endif /* !CONFIG_EMBEDDED */
8530
8531 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
8532 kn->kn_udata = kev->udata;
8533
8534 /*
8535 * reset the output flags based on a
8536 * combination of the old events and
8537 * the new desired event list.
8538 */
8539 //kn->kn_fflags &= kn->kn_sfflags;
8540
8541 res = (kn->kn_fflags != 0);
8542
8543 memorystatus_klist_unlock();
8544
8545 return res;
8546 }
8547
8548 static int
8549 filt_memorystatusprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
8550 {
8551 #pragma unused(data)
8552 int res;
8553
8554 memorystatus_klist_lock();
8555 res = (kn->kn_fflags != 0);
8556 if (res) {
8557 *kev = kn->kn_kevent;
8558 kn->kn_flags |= EV_CLEAR; /* automatic */
8559 kn->kn_fflags = 0;
8560 kn->kn_data = 0;
8561 }
8562 memorystatus_klist_unlock();
8563
8564 return res;
8565 }
8566
8567 static void
8568 memorystatus_klist_lock(void) {
8569 lck_mtx_lock(&memorystatus_klist_mutex);
8570 }
8571
8572 static void
8573 memorystatus_klist_unlock(void) {
8574 lck_mtx_unlock(&memorystatus_klist_mutex);
8575 }
8576
8577 void
8578 memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr) {
8579 lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
8580 klist_init(&memorystatus_klist);
8581 }
8582
8583 int
8584 memorystatus_knote_register(struct knote *kn) {
8585 int error = 0;
8586
8587 memorystatus_klist_lock();
8588
8589 /*
8590 * Support only userspace visible flags.
8591 */
8592 if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
8593
8594 #if !CONFIG_EMBEDDED
8595 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
8596 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
8597 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
8598 }
8599
8600 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
8601 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
8602 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
8603 }
8604 #endif /* !CONFIG_EMBEDDED */
8605
8606 KNOTE_ATTACH(&memorystatus_klist, kn);
8607
8608 } else {
8609 error = ENOTSUP;
8610 }
8611
8612 memorystatus_klist_unlock();
8613
8614 return error;
8615 }
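
#if 0
/*
 * Illustrative sketch only (not compiled): how a userspace client might arm
 * this filter.  EVFILT_MEMORYSTATUS is process-wide, so the kevent ident is
 * unused here (0), and the requested fflags must stay within
 * EVFILT_MEMORYSTATUS_ALL_MASK or memorystatus_knote_register() above
 * returns ENOTSUP.  Flag names come from <sys/event.h>; nothing about
 * entitlements or availability to third-party code is implied.
 */
#include <sys/event.h>
#include <unistd.h>

static void
example_register_for_memorystatus_notes(void)
{
	struct kevent ev;
	int kq = kqueue();

	EV_SET(&ev, 0, EVFILT_MEMORYSTATUS, EV_ADD | EV_CLEAR,
	    NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PROC_LIMIT_WARN,
	    0, NULL);
	(void)kevent(kq, &ev, 1, NULL, 0, NULL);

	/* A later kevent() call with no changelist blocks until a note fires. */
	close(kq);
}
#endif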
8616
8617 void
8618 memorystatus_knote_unregister(struct knote *kn __unused) {
8619 memorystatus_klist_lock();
8620 KNOTE_DETACH(&memorystatus_klist, kn);
8621 memorystatus_klist_unlock();
8622 }
8623
8624
8625 #if 0
8626 #if CONFIG_JETSAM && VM_PRESSURE_EVENTS
8627 static boolean_t
8628 memorystatus_issue_pressure_kevent(boolean_t pressured) {
8629 memorystatus_klist_lock();
8630 KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
8631 memorystatus_klist_unlock();
8632 return TRUE;
8633 }
8634 #endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
8635 #endif /* 0 */
8636
8637 /* Coalition support */
8638
8639 /* sorting info for a particular priority bucket */
8640 typedef struct memstat_sort_info {
8641 coalition_t msi_coal;
8642 uint64_t msi_page_count;
8643 pid_t msi_pid;
8644 int msi_ntasks;
8645 } memstat_sort_info_t;
8646
8647 /*
8648 * qsort from smallest page count to largest page count
8649 *
8650 * return < 0 for a < b
8651 * 0 for a == b
8652 * > 0 for a > b
8653 */
8654 static int memstat_asc_cmp(const void *a, const void *b)
8655 {
8656 const memstat_sort_info_t *msA = (const memstat_sort_info_t *)a;
8657 const memstat_sort_info_t *msB = (const memstat_sort_info_t *)b;
8658
8659 return (msA->msi_page_count < msB->msi_page_count) ? -1 : ((msA->msi_page_count > msB->msi_page_count) ? 1 : 0);
8660 }
8661
8662 /*
8663 * Return the number of pids rearranged during this sort.
8664 */
8665 static int
8666 memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order)
8667 {
8668 #define MAX_SORT_PIDS 80
8669 #define MAX_COAL_LEADERS 10
8670
8671 unsigned int b = bucket_index;
8672 int nleaders = 0;
8673 int ntasks = 0;
8674 proc_t p = NULL;
8675 coalition_t coal = COALITION_NULL;
8676 int pids_moved = 0;
8677 int total_pids_moved = 0;
8678 int i;
8679
8680 /*
8681 * The system is typically under memory pressure when in this
8682 * path; hence, we want to avoid dynamic memory allocation.
8683 */
8684 memstat_sort_info_t leaders[MAX_COAL_LEADERS];
8685 pid_t pid_list[MAX_SORT_PIDS];
8686
8687 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
8688 return(0);
8689 }
8690
8691 /*
8692 * Clear the array that holds coalition leader information
8693 */
8694 for (i=0; i < MAX_COAL_LEADERS; i++) {
8695 leaders[i].msi_coal = COALITION_NULL;
8696 leaders[i].msi_page_count = 0; /* will hold total coalition page count */
8697 leaders[i].msi_pid = 0; /* will hold coalition leader pid */
8698 leaders[i].msi_ntasks = 0; /* will hold the number of tasks in a coalition */
8699 }
8700
8701 p = memorystatus_get_first_proc_locked(&b, FALSE);
8702 while (p) {
8703 if (coalition_is_leader(p->task, COALITION_TYPE_JETSAM, &coal)) {
8704 if (nleaders < MAX_COAL_LEADERS) {
8705 int coal_ntasks = 0;
8706 uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks);
8707 leaders[nleaders].msi_coal = coal;
8708 leaders[nleaders].msi_page_count = coal_page_count;
8709 leaders[nleaders].msi_pid = p->p_pid; /* the coalition leader */
8710 leaders[nleaders].msi_ntasks = coal_ntasks;
8711 nleaders++;
8712 } else {
8713 /*
8714 * We've hit MAX_COAL_LEADERS meaning we can handle no more coalitions.
8715 * Abandoned coalitions will linger at the tail of the priority band
8716 * when this sort session ends.
8717 * TODO: should this be an assert?
8718 */
8719 printf("%s: WARNING: more than %d leaders in priority band [%d]\n",
8720 __FUNCTION__, MAX_COAL_LEADERS, bucket_index);
8721 break;
8722 }
8723 }
8724 p=memorystatus_get_next_proc_locked(&b, p, FALSE);
8725 }
8726
8727 if (nleaders == 0) {
8728 /* Nothing to sort */
8729 return(0);
8730 }
8731
8732 /*
8733 * Sort the coalition leader array, from smallest coalition page count
8734 * to largest coalition page count. When re-inserted at the head of the priority
8735 * bucket, the smallest coalition is handled first and therefore jetsams last.
8736 */
8737 if (nleaders > 1) {
8738 qsort(leaders, nleaders, sizeof(memstat_sort_info_t), memstat_asc_cmp);
8739 }
8740
8741 #if 0
8742 for (i = 0; i < nleaders; i++) {
8743 printf("%s: coal_leader[%d of %d] pid[%d] pages[%llu] ntasks[%d]\n",
8744 __FUNCTION__, i, nleaders, leaders[i].msi_pid, leaders[i].msi_page_count,
8745 leaders[i].msi_ntasks);
8746 }
8747 #endif
8748
8749 /*
8750 * During coalition sorting, processes in a priority band are rearranged
8751 * by being re-inserted at the head of the queue. So, when handling a
8752 * list, the first process that gets moved to the head of the queue
8753 * ultimately gets pushed toward the queue tail and, hence, jetsams last.
8754 *
8755 * So, for example, the coalition leader is expected to jetsam last,
8756 * after its coalition members. Therefore, the coalition leader is
8757 * inserted at the head of the queue first.
8758 *
8759 * After processing a coalition, the jetsam order is as follows:
8760 * undefs(jetsam first), extensions, xpc services, leader(jetsam last)
8761 */
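
/*
 * Worked example (illustrative only): suppose a coalition in this band has
 * leader L, one xpc service X, one extension E and one undefined member U.
 * The loop below moves members to the head of the bucket in the order
 * L, then X, then E, then U, so each later move pushes the earlier ones
 * toward the tail.  The resulting head-to-tail order, which is also the
 * jetsam order, is:
 *
 *	U (undef, jetsam first) -> E (ext) -> X (xpc) -> L (leader, jetsam last)
 */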
8762
8763 /*
8764 * Coalition members are rearranged in the priority bucket here,
8765 * based on their coalition role.
8766 */
8767 total_pids_moved = 0;
8768 for (i=0; i < nleaders; i++) {
8769
8770 /* a bit of bookkeeping */
8771 pids_moved = 0;
8772
8773 /* Coalition leaders are jetsammed last, so move into place first */
8774 pid_list[0] = leaders[i].msi_pid;
8775 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, 1);
8776
8777 /* xpc services should jetsam after extensions */
8778 ntasks = coalition_get_pid_list (leaders[i].msi_coal, COALITION_ROLEMASK_XPC,
8779 coal_sort_order, pid_list, MAX_SORT_PIDS);
8780
8781 if (ntasks > 0) {
8782 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
8783 (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
8784 }
8785
8786 /* extensions should jetsam after unmarked processes */
8787 ntasks = coalition_get_pid_list (leaders[i].msi_coal, COALITION_ROLEMASK_EXT,
8788 coal_sort_order, pid_list, MAX_SORT_PIDS);
8789
8790 if (ntasks > 0) {
8791 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
8792 (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
8793 }
8794
8795 /* undefined coalition members should be the first to jetsam */
8796 ntasks = coalition_get_pid_list (leaders[i].msi_coal, COALITION_ROLEMASK_UNDEF,
8797 coal_sort_order, pid_list, MAX_SORT_PIDS);
8798
8799 if (ntasks > 0) {
8800 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
8801 (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
8802 }
8803
8804 #if 0
8805 if (pids_moved == leaders[i].msi_ntasks) {
8806 /*
8807 * All the pids in the coalition were found in this band.
8808 */
8809 printf("%s: pids_moved[%d] equal total coalition ntasks[%d] \n", __FUNCTION__,
8810 pids_moved, leaders[i].msi_ntasks);
8811 } else if (pids_moved > leaders[i].msi_ntasks) {
8812 /*
8813 * Apparently new coalition members showed up during the sort?
8814 */
8815 printf("%s: pids_moved[%d] were greater than expected coalition ntasks[%d] \n", __FUNCTION__,
8816 pids_moved, leaders[i].msi_ntasks);
8817 } else {
8818 /*
8819 * Apparently not all the pids in the coalition were found in this band?
8820 */
8821 printf("%s: pids_moved[%d] were less than expected coalition ntasks[%d] \n", __FUNCTION__,
8822 pids_moved, leaders[i].msi_ntasks);
8823 }
8824 #endif
8825
8826 total_pids_moved += pids_moved;
8827
8828 } /* end for */
8829
8830 return(total_pids_moved);
8831 }
8832
8833
8834 /*
8835 * Traverse a list of pids, searching for each within the priority band provided.
8836 * If pid is found, move it to the front of the priority band.
8837 * Never searches outside the priority band provided.
8838 *
8839 * Input:
8840 * bucket_index - jetsam priority band.
8841 * pid_list - pointer to a list of pids.
8842 * list_sz - number of pids in the list.
8843 *
8844 * Pid list ordering is important in that
8845 * pid_list[n] is expected to jetsam ahead of pid_list[n+1].
8846 * The sort_order is set by the coalition default.
8847 *
8848 * Return:
8849 * the number of pids found and hence moved within the priority band.
8850 */
8851 static int
8852 memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz)
8853 {
8854 memstat_bucket_t *current_bucket;
8855 int i;
8856 int found_pids = 0;
8857
8858 if ((pid_list == NULL) || (list_sz <= 0)) {
8859 return(0);
8860 }
8861
8862 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
8863 return(0);
8864 }
8865
8866 current_bucket = &memstat_bucket[bucket_index];
8867 for (i=0; i < list_sz; i++) {
8868 unsigned int b = bucket_index;
8869 proc_t p = NULL;
8870 proc_t aProc = NULL;
8871 pid_t aPid;
8872 int list_index;
8873
8874 list_index = ((list_sz - 1) - i);
8875 aPid = pid_list[list_index];
8876
8877 /* never search beyond bucket_index provided */
8878 p = memorystatus_get_first_proc_locked(&b, FALSE);
8879 while (p) {
8880 if (p->p_pid == aPid) {
8881 aProc = p;
8882 break;
8883 }
8884 p = memorystatus_get_next_proc_locked(&b, p, FALSE);
8885 }
8886
8887 if (aProc == NULL) {
8888 /* pid not found in this band, just skip it */
8889 continue;
8890 } else {
8891 TAILQ_REMOVE(&current_bucket->list, aProc, p_memstat_list);
8892 TAILQ_INSERT_HEAD(&current_bucket->list, aProc, p_memstat_list);
8893 found_pids++;
8894 }
8895 }
8896 return(found_pids);
8897 }
8898
8899 int
8900 memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index)
8901 {
8902 int32_t i = JETSAM_PRIORITY_IDLE;
8903 int count = 0;
8904
8905 if (max_bucket_index >= MEMSTAT_BUCKET_COUNT) {
8906 return(-1);
8907 }
8908
8909 while(i <= max_bucket_index) {
8910 count += memstat_bucket[i++].count;
8911 }
8912
8913 return count;
8914 }
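
#if 0
/*
 * Illustrative sketch only (hypothetical caller): count the processes sitting
 * at or below a given band before doing work that scales with that population.
 * JETSAM_PRIORITY_FOREGROUND is just an example band; a return of -1 means the
 * band index was out of range.
 */
static void
example_count_procs_upto_foreground(void)
{
	int nprocs = memorystatus_get_proccnt_upto_priority(JETSAM_PRIORITY_FOREGROUND);

	if (nprocs >= 0) {
		printf("%d processes at or below FOREGROUND\n", nprocs);
	}
}
#endif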
8915
8916 int
8917 memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap)
8918 {
8919 #if !CONFIG_JETSAM
8920 if (!p || (!isApp(p)) || (p->p_memstat_state & P_MEMSTAT_INTERNAL)) {
8921 /*
8922 * Ineligible processes OR system processes e.g. launchd.
8923 */
8924 return -1;
8925 }
8926
8927 /*
8928 * For macOS only:
8929 * We would like to use memorystatus_update() here to move the processes
8930 * within the bands. Unfortunately memorystatus_update() calls
8931 * memorystatus_update_priority_locked() which uses any band transitions
8932 * as an indication to modify ledgers. For that it needs the task lock
8933 * and since we came into this function with the task lock held, we'll deadlock.
8934 *
8935 * Unfortunately we can't completely disable ledger updates because we still
8936 * need the ledger updates for a subset of processes i.e. daemons.
8937 * When all processes on all platforms support memory limits, we can simply call
8938 * memorystatus_update().
8939 *
8940 * It also has some logic to deal with 'aging' which, currently, is only applicable
8941 * on CONFIG_JETSAM configs. So, until every platform has CONFIG_JETSAM, we'll need
8942 * to do this explicit band transition.
8943 */
8944
8945 memstat_bucket_t *current_bucket, *new_bucket;
8946 int32_t priority = 0;
8947
8948 proc_list_lock();
8949
8950 if (((p->p_listflag & P_LIST_EXITED) != 0) ||
8951 (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED))) {
8952 /*
8953 * If the process is on its way out OR
8954 * jetsam has already tried and failed to kill this process,
8955 * let's skip the whole jetsam band transition.
8956 */
8957 proc_list_unlock();
8958 return(0);
8959 }
8960
8961 if (is_appnap) {
8962 current_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
8963 new_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
8964 priority = JETSAM_PRIORITY_IDLE;
8965 } else {
8966 if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
8967 /*
8968 * It is possible that someone pulled this process
8969 * out of the IDLE band without updating its app-nap
8970 * parameters.
8971 */
8972 proc_list_unlock();
8973 return (0);
8974 }
8975
8976 current_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
8977 new_bucket = &memstat_bucket[p->p_memstat_requestedpriority];
8978 priority = p->p_memstat_requestedpriority;
8979 }
8980
8981 TAILQ_REMOVE(&current_bucket->list, p, p_memstat_list);
8982 current_bucket->count--;
8983
8984 TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
8985 new_bucket->count++;
8986
8987 /*
8988 * Record idle start or idle delta.
8989 */
8990 if (p->p_memstat_effectivepriority == priority) {
8991 /*
8992 * This process is not transitioning between
8993 * jetsam priority buckets. Do nothing.
8994 */
8995 } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
8996 uint64_t now;
8997 /*
8998 * Transitioning out of the idle priority bucket.
8999 * Record idle delta.
9000 */
9001 assert(p->p_memstat_idle_start != 0);
9002 now = mach_absolute_time();
9003 if (now > p->p_memstat_idle_start) {
9004 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
9005 }
9006 } else if (priority == JETSAM_PRIORITY_IDLE) {
9007 /*
9008 * Transitioning into the idle priority bucket.
9009 * Record idle start.
9010 */
9011 p->p_memstat_idle_start = mach_absolute_time();
9012 }
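
/*
 * Note (illustrative): p_memstat_idle_start and p_memstat_idle_delta are
 * in mach_absolute_time() units.  A consumer that wants wall-clock time
 * would convert the delta, e.g.:
 *
 *	uint64_t idle_ns;
 *	absolutetime_to_nanoseconds(p->p_memstat_idle_delta, &idle_ns);
 */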
9013
9014 p->p_memstat_effectivepriority = priority;
9015
9016 proc_list_unlock();
9017
9018 return (0);
9019
9020 #else /* !CONFIG_JETSAM */
9021 #pragma unused(p)
9022 #pragma unused(is_appnap)
9023 return -1;
9024 #endif /* !CONFIG_JETSAM */
9025 }